Index: head/sys/dev/drm2/ttm/ttm_bo.c =================================================================== --- head/sys/dev/drm2/ttm/ttm_bo.c (revision 292468) +++ head/sys/dev/drm2/ttm/ttm_bo.c (revision 292469) @@ -1,1895 +1,1895 @@ /************************************************************************** * * Copyright (c) 2006-2009 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sub license, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. * **************************************************************************/ /* * Authors: Thomas Hellstrom */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #define TTM_ASSERT_LOCKED(param) #define TTM_DEBUG(fmt, arg...) 
#define TTM_BO_HASH_ORDER 13 static int ttm_bo_setup_vm(struct ttm_buffer_object *bo); static int ttm_bo_swapout(struct ttm_mem_shrink *shrink); static void ttm_bo_global_kobj_release(struct ttm_bo_global *glob); MALLOC_DEFINE(M_TTM_BO, "ttm_bo", "TTM Buffer Objects"); static inline int ttm_mem_type_from_flags(uint32_t flags, uint32_t *mem_type) { int i; for (i = 0; i <= TTM_PL_PRIV5; i++) if (flags & (1 << i)) { *mem_type = i; return 0; } return -EINVAL; } static void ttm_mem_type_debug(struct ttm_bo_device *bdev, int mem_type) { struct ttm_mem_type_manager *man = &bdev->man[mem_type]; printf(" has_type: %d\n", man->has_type); printf(" use_type: %d\n", man->use_type); printf(" flags: 0x%08X\n", man->flags); printf(" gpu_offset: 0x%08lX\n", man->gpu_offset); printf(" size: %ju\n", (uintmax_t)man->size); printf(" available_caching: 0x%08X\n", man->available_caching); printf(" default_caching: 0x%08X\n", man->default_caching); if (mem_type != TTM_PL_SYSTEM) (*man->func->debug)(man, TTM_PFX); } static void ttm_bo_mem_space_debug(struct ttm_buffer_object *bo, struct ttm_placement *placement) { int i, ret, mem_type; printf("No space for %p (%lu pages, %luK, %luM)\n", bo, bo->mem.num_pages, bo->mem.size >> 10, bo->mem.size >> 20); for (i = 0; i < placement->num_placement; i++) { ret = ttm_mem_type_from_flags(placement->placement[i], &mem_type); if (ret) return; printf(" placement[%d]=0x%08X (%d)\n", i, placement->placement[i], mem_type); ttm_mem_type_debug(bo->bdev, mem_type); } } #if 0 static ssize_t ttm_bo_global_show(struct ttm_bo_global *glob, char *buffer) { return snprintf(buffer, PAGE_SIZE, "%lu\n", (unsigned long) atomic_read(&glob->bo_count)); } #endif static inline uint32_t ttm_bo_type_flags(unsigned type) { return 1 << (type); } static void ttm_bo_release_list(struct ttm_buffer_object *bo) { struct ttm_bo_device *bdev = bo->bdev; size_t acc_size = bo->acc_size; MPASS(atomic_read(&bo->list_kref) == 0); MPASS(atomic_read(&bo->kref) == 0); 
MPASS(atomic_read(&bo->cpu_writers) == 0); MPASS(bo->sync_obj == NULL); MPASS(bo->mem.mm_node == NULL); MPASS(list_empty(&bo->lru)); MPASS(list_empty(&bo->ddestroy)); if (bo->ttm) ttm_tt_destroy(bo->ttm); atomic_dec(&bo->glob->bo_count); if (bo->destroy) bo->destroy(bo); else { free(bo, M_TTM_BO); } ttm_mem_global_free(bdev->glob->mem_glob, acc_size); } static int ttm_bo_wait_unreserved_locked(struct ttm_buffer_object *bo, bool interruptible) { const char *wmsg; int flags, ret; ret = 0; if (interruptible) { flags = PCATCH; wmsg = "ttbowi"; } else { flags = 0; wmsg = "ttbowu"; } while (ttm_bo_is_reserved(bo)) { ret = -msleep(bo, &bo->glob->lru_lock, flags, wmsg, 0); if (ret == -EINTR || ret == -ERESTART) ret = -ERESTARTSYS; if (ret != 0) break; } return (ret); } void ttm_bo_add_to_lru(struct ttm_buffer_object *bo) { struct ttm_bo_device *bdev = bo->bdev; struct ttm_mem_type_manager *man; MPASS(ttm_bo_is_reserved(bo)); if (!(bo->mem.placement & TTM_PL_FLAG_NO_EVICT)) { MPASS(list_empty(&bo->lru)); man = &bdev->man[bo->mem.mem_type]; list_add_tail(&bo->lru, &man->lru); refcount_acquire(&bo->list_kref); if (bo->ttm != NULL) { list_add_tail(&bo->swap, &bo->glob->swap_lru); refcount_acquire(&bo->list_kref); } } } int ttm_bo_del_from_lru(struct ttm_buffer_object *bo) { int put_count = 0; if (!list_empty(&bo->swap)) { list_del_init(&bo->swap); ++put_count; } if (!list_empty(&bo->lru)) { list_del_init(&bo->lru); ++put_count; } /* * TODO: Add a driver hook to delete from * driver-specific LRU's here. */ return put_count; } int ttm_bo_reserve_nolru(struct ttm_buffer_object *bo, bool interruptible, bool no_wait, bool use_sequence, uint32_t sequence) { int ret; while (unlikely(atomic_xchg(&bo->reserved, 1) != 0)) { /** * Deadlock avoidance for multi-bo reserving. */ if (use_sequence && bo->seq_valid) { /** * We've already reserved this one. */ if (unlikely(sequence == bo->val_seq)) return -EDEADLK; /** * Already reserved by a thread that will not back * off for us. 
We need to back off. */ if (unlikely(sequence - bo->val_seq < (1U << 31))) return -EAGAIN; } if (no_wait) return -EBUSY; ret = ttm_bo_wait_unreserved_locked(bo, interruptible); if (unlikely(ret)) return ret; } if (use_sequence) { bool wake_up = false; /** * Wake up waiters that may need to recheck for deadlock, * if we decreased the sequence number. */ if (unlikely((bo->val_seq - sequence < (1U << 31)) || !bo->seq_valid)) wake_up = true; /* * In the worst case with memory ordering these values can be * seen in the wrong order. However since we call wake_up_all * in that case, this will hopefully not pose a problem, * and the worst case would only cause someone to accidentally * hit -EAGAIN in ttm_bo_reserve when they see old value of * val_seq. However this would only happen if seq_valid was * written before val_seq was, and just means some slightly * increased cpu usage */ bo->val_seq = sequence; bo->seq_valid = true; if (wake_up) wakeup(bo); } else { bo->seq_valid = false; } return 0; } void ttm_bo_list_ref_sub(struct ttm_buffer_object *bo, int count, bool never_free) { u_int old; old = atomic_fetchadd_int(&bo->list_kref, -count); if (old <= count) { if (never_free) panic("ttm_bo_ref_buf"); ttm_bo_release_list(bo); } } int ttm_bo_reserve(struct ttm_buffer_object *bo, bool interruptible, bool no_wait, bool use_sequence, uint32_t sequence) { struct ttm_bo_global *glob = bo->glob; int put_count = 0; int ret; mtx_lock(&bo->glob->lru_lock); ret = ttm_bo_reserve_nolru(bo, interruptible, no_wait, use_sequence, sequence); if (likely(ret == 0)) { put_count = ttm_bo_del_from_lru(bo); mtx_unlock(&glob->lru_lock); ttm_bo_list_ref_sub(bo, put_count, true); } else mtx_unlock(&bo->glob->lru_lock); return ret; } int ttm_bo_reserve_slowpath_nolru(struct ttm_buffer_object *bo, bool interruptible, uint32_t sequence) { bool wake_up = false; int ret; while (unlikely(atomic_xchg(&bo->reserved, 1) != 0)) { if (bo->seq_valid && sequence == bo->val_seq) { DRM_ERROR( "%s: bo->seq_valid && 
sequence == bo->val_seq", __func__); } ret = ttm_bo_wait_unreserved_locked(bo, interruptible); if (unlikely(ret)) return ret; } if ((bo->val_seq - sequence < (1U << 31)) || !bo->seq_valid) wake_up = true; /** * Wake up waiters that may need to recheck for deadlock, * if we decreased the sequence number. */ bo->val_seq = sequence; bo->seq_valid = true; if (wake_up) wakeup(bo); return 0; } int ttm_bo_reserve_slowpath(struct ttm_buffer_object *bo, bool interruptible, uint32_t sequence) { struct ttm_bo_global *glob = bo->glob; int put_count, ret; mtx_lock(&glob->lru_lock); ret = ttm_bo_reserve_slowpath_nolru(bo, interruptible, sequence); if (likely(!ret)) { put_count = ttm_bo_del_from_lru(bo); mtx_unlock(&glob->lru_lock); ttm_bo_list_ref_sub(bo, put_count, true); } else mtx_unlock(&glob->lru_lock); return ret; } void ttm_bo_unreserve_locked(struct ttm_buffer_object *bo) { ttm_bo_add_to_lru(bo); atomic_set(&bo->reserved, 0); wakeup(bo); } void ttm_bo_unreserve(struct ttm_buffer_object *bo) { struct ttm_bo_global *glob = bo->glob; mtx_lock(&glob->lru_lock); ttm_bo_unreserve_locked(bo); mtx_unlock(&glob->lru_lock); } /* * Call bo->mutex locked. 
*/

/*
 * ttm_bo_add_ttm: create the ttm backing object for bo through the
 * driver's ttm_tt_create hook.
 *
 * bo->ttm is reset to NULL first.  TTM_PAGE_FLAG_DMA32 is requested when
 * the device needs it, TTM_PAGE_FLAG_ZERO_ALLOC only for device buffers
 * when zero_alloc is set, and TTM_PAGE_FLAG_SG for sg buffers (whose ttm
 * also inherits bo->sg).  Returns 0, -ENOMEM if the driver returned no
 * ttm, or -EINVAL for an unknown buffer type.
 */
static int ttm_bo_add_ttm(struct ttm_buffer_object *bo, bool zero_alloc)
{
	struct ttm_bo_device *bdev = bo->bdev;
	struct ttm_bo_global *glob = bo->glob;
	int ret = 0;
	uint32_t page_flags = 0;

	TTM_ASSERT_LOCKED(&bo->mutex);
	bo->ttm = NULL;

	if (bdev->need_dma32)
		page_flags |= TTM_PAGE_FLAG_DMA32;

	switch (bo->type) {
	case ttm_bo_type_device:
		if (zero_alloc)
			page_flags |= TTM_PAGE_FLAG_ZERO_ALLOC;
		/* FALLTHROUGH: device buffers are created like kernel ones. */
	case ttm_bo_type_kernel:
		bo->ttm = bdev->driver->ttm_tt_create(bdev, bo->num_pages << PAGE_SHIFT,
		    page_flags, glob->dummy_read_page);
		if (unlikely(bo->ttm == NULL))
			ret = -ENOMEM;
		break;
	case ttm_bo_type_sg:
		bo->ttm = bdev->driver->ttm_tt_create(bdev, bo->num_pages << PAGE_SHIFT,
		    page_flags | TTM_PAGE_FLAG_SG,
		    glob->dummy_read_page);
		if (unlikely(bo->ttm == NULL)) {
			ret = -ENOMEM;
			break;
		}
		bo->ttm->sg = bo->sg;
		break;
	default:
		printf("[TTM] Illegal buffer object type\n");
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * ttm_bo_handle_move_mem: move bo from its current memory region
 * (bo->mem) to the region described by mem.
 *
 * Steps, in order: tear down CPU mappings when either side is PCI memory
 * or the caching attributes share no bit; create/bind a ttm when the
 * destination manager is not TTM_MEMTYPE_FLAG_FIXED; notify the driver;
 * perform the move with ttm_bo_move_ttm(), the driver's move hook or
 * ttm_bo_move_memcpy(); finally refresh bo->offset / bo->cur_placement.
 *
 * Returns 0 on success.  On failure the error is returned and, if the bo
 * is (still) in a fixed memory type, its ttm is unbound and destroyed.
 */
static int ttm_bo_handle_move_mem(struct ttm_buffer_object *bo,
    struct ttm_mem_reg *mem,
    bool evict, bool interruptible,
    bool no_wait_gpu)
{
	struct ttm_bo_device *bdev = bo->bdev;
	bool old_is_pci = ttm_mem_reg_is_pci(bdev, &bo->mem);
	bool new_is_pci = ttm_mem_reg_is_pci(bdev, mem);
	struct ttm_mem_type_manager *old_man = &bdev->man[bo->mem.mem_type];
	struct ttm_mem_type_manager *new_man = &bdev->man[mem->mem_type];
	int ret = 0;

	if (old_is_pci || new_is_pci ||
	    ((mem->placement & bo->mem.placement & TTM_PL_MASK_CACHING) == 0)) {
		ret = ttm_mem_io_lock(old_man, true);
		if (unlikely(ret != 0))
			goto out_err;
		ttm_bo_unmap_virtual_locked(bo);
		ttm_mem_io_unlock(old_man);
	}

	/*
	 * Create and bind a ttm if required.
	 */

	if (!(new_man->flags & TTM_MEMTYPE_FLAG_FIXED)) {
		if (bo->ttm == NULL) {
			/* Zero-fill only when the old type was not fixed. */
			bool zero = !(old_man->flags & TTM_MEMTYPE_FLAG_FIXED);
			ret = ttm_bo_add_ttm(bo, zero);
			if (ret)
				goto out_err;
		}

		ret = ttm_tt_set_placement_caching(bo->ttm, mem->placement);
		if (ret)
			goto out_err;

		if (mem->mem_type != TTM_PL_SYSTEM) {
			ret = ttm_tt_bind(bo->ttm, mem);
			if (ret)
				goto out_err;
		}

		/*
		 * Coming from system memory nothing has to be copied:
		 * adopt the new region and take ownership of its node.
		 */
		if (bo->mem.mem_type == TTM_PL_SYSTEM) {
			if (bdev->driver->move_notify)
				bdev->driver->move_notify(bo, mem);
			bo->mem = *mem;
			mem->mm_node = NULL;
			goto moved;
		}
	}

	if (bdev->driver->move_notify)
		bdev->driver->move_notify(bo, mem);

	if (!(old_man->flags & TTM_MEMTYPE_FLAG_FIXED) &&
	    !(new_man->flags & TTM_MEMTYPE_FLAG_FIXED))
		ret = ttm_bo_move_ttm(bo, evict, no_wait_gpu, mem);
	else if (bdev->driver->move)
		ret = bdev->driver->move(bo, evict, interruptible,
		    no_wait_gpu, mem);
	else
		ret = ttm_bo_move_memcpy(bo, evict, no_wait_gpu, mem);

	if (ret) {
		/*
		 * The move failed after the driver was notified.  Swap
		 * bo->mem and *mem so the second notification describes
		 * the reverse move, then swap them back.
		 */
		if (bdev->driver->move_notify) {
			struct ttm_mem_reg tmp_mem = *mem;
			*mem = bo->mem;
			bo->mem = tmp_mem;
			bdev->driver->move_notify(bo, mem);
			bo->mem = *mem;
			*mem = tmp_mem;
		}

		goto out_err;
	}

moved:
	if (bo->evicted) {
		/* A failed cache invalidation is logged, not propagated. */
		ret = bdev->driver->invalidate_caches(bdev, bo->mem.placement);
		if (ret)
			printf("[TTM] Can not flush read caches\n");
		bo->evicted = false;
	}

	if (bo->mem.mm_node) {
		bo->offset = (bo->mem.start << PAGE_SHIFT) +
		    bdev->man[bo->mem.mem_type].gpu_offset;
		bo->cur_placement = bo->mem.placement;
	} else
		bo->offset = 0;

	return 0;

out_err:
	/* Re-evaluated: bo->mem may still be the old region here. */
	new_man = &bdev->man[bo->mem.mem_type];
	if ((new_man->flags & TTM_MEMTYPE_FLAG_FIXED) && bo->ttm) {
		ttm_tt_unbind(bo->ttm);
		ttm_tt_destroy(bo->ttm);
		bo->ttm = NULL;
	}

	return ret;
}

/**
 * Call bo::reserved.
 * Will release GPU memory type usage on destruction.
 * This is the place to put in driver specific hooks to release
 * driver private resources.
 * Will release the bo::reserved lock.
*/ static void ttm_bo_cleanup_memtype_use(struct ttm_buffer_object *bo) { if (bo->bdev->driver->move_notify) bo->bdev->driver->move_notify(bo, NULL); if (bo->ttm) { ttm_tt_unbind(bo->ttm); ttm_tt_destroy(bo->ttm); bo->ttm = NULL; } ttm_bo_mem_put(bo, &bo->mem); atomic_set(&bo->reserved, 0); wakeup(&bo); /* * Since the final reference to this bo may not be dropped by * the current task we have to put a memory barrier here to make * sure the changes done in this function are always visible. * * This function only needs protection against the final kref_put. */ mb(); } static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo) { struct ttm_bo_device *bdev = bo->bdev; struct ttm_bo_global *glob = bo->glob; struct ttm_bo_driver *driver = bdev->driver; void *sync_obj = NULL; int put_count; int ret; mtx_lock(&glob->lru_lock); ret = ttm_bo_reserve_nolru(bo, false, true, false, 0); mtx_lock(&bdev->fence_lock); (void) ttm_bo_wait(bo, false, false, true); if (!ret && !bo->sync_obj) { mtx_unlock(&bdev->fence_lock); put_count = ttm_bo_del_from_lru(bo); mtx_unlock(&glob->lru_lock); ttm_bo_cleanup_memtype_use(bo); ttm_bo_list_ref_sub(bo, put_count, true); return; } if (bo->sync_obj) sync_obj = driver->sync_obj_ref(bo->sync_obj); mtx_unlock(&bdev->fence_lock); if (!ret) { atomic_set(&bo->reserved, 0); wakeup(bo); } refcount_acquire(&bo->list_kref); list_add_tail(&bo->ddestroy, &bdev->ddestroy); mtx_unlock(&glob->lru_lock); if (sync_obj) { driver->sync_obj_flush(sync_obj); driver->sync_obj_unref(&sync_obj); } taskqueue_enqueue_timeout(taskqueue_thread, &bdev->wq, ((hz / 100) < 1) ? 1 : hz / 100); } /** * function ttm_bo_cleanup_refs_and_unlock * If bo idle, remove from delayed- and lru lists, and unref. * If not idle, do nothing. * * Must be called with lru_lock and reservation held, this function * will drop both before returning. * * @interruptible Any sleeps should occur interruptibly. * @no_wait_gpu Never wait for gpu. Return -EBUSY instead. 
*/

/*
 * Returns 0 when the bo was cleaned up (or when another thread won the
 * race for the reservation after the wait), otherwise the error from
 * ttm_bo_wait() / the driver's sync_obj_wait hook.
 */
static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo,
    bool interruptible,
    bool no_wait_gpu)
{
	struct ttm_bo_device *bdev = bo->bdev;
	struct ttm_bo_driver *driver = bdev->driver;
	struct ttm_bo_global *glob = bo->glob;
	int put_count;
	int ret;

	mtx_lock(&bdev->fence_lock);
	/* Non-blocking idle check (last argument is no_wait). */
	ret = ttm_bo_wait(bo, false, false, true);

	if (ret && !no_wait_gpu) {
		void *sync_obj;

		/*
		 * Take a reference to the fence and unreserve,
		 * at this point the buffer should be dead, so
		 * no new sync objects can be attached.
		 */
		sync_obj = driver->sync_obj_ref(bo->sync_obj);
		mtx_unlock(&bdev->fence_lock);

		atomic_set(&bo->reserved, 0);
		wakeup(bo);
		mtx_unlock(&glob->lru_lock);

		/* Both locks are dropped here, so sleeping is allowed. */
		ret = driver->sync_obj_wait(sync_obj, false, interruptible);
		driver->sync_obj_unref(&sync_obj);
		if (ret)
			return ret;

		/*
		 * remove sync_obj with ttm_bo_wait, the wait should be
		 * finished, and no new wait object should have been added.
		 */
		mtx_lock(&bdev->fence_lock);
		ret = ttm_bo_wait(bo, false, false, true);
		mtx_unlock(&bdev->fence_lock);
		if (ret)
			return ret;

		mtx_lock(&glob->lru_lock);
		ret = ttm_bo_reserve_nolru(bo, false, true, false, 0);

		/*
		 * We raced, and lost, someone else holds the reservation now,
		 * and is probably busy in ttm_bo_cleanup_memtype_use.
		 *
		 * Even if it's not the case, because we finished waiting any
		 * delayed destruction would succeed, so just return success
		 * here.
		 */
		if (ret) {
			mtx_unlock(&glob->lru_lock);
			return 0;
		}
	} else
		mtx_unlock(&bdev->fence_lock);

	/*
	 * Still busy (and not allowed to wait), or no longer on the
	 * delayed-destroy list: just drop the reservation and lru_lock.
	 */
	if (ret || unlikely(list_empty(&bo->ddestroy))) {
		atomic_set(&bo->reserved, 0);
		wakeup(bo);
		mtx_unlock(&glob->lru_lock);
		return ret;
	}

	put_count = ttm_bo_del_from_lru(bo);
	list_del_init(&bo->ddestroy);
	/* One more reference for the ddestroy list membership. */
	++put_count;

	mtx_unlock(&glob->lru_lock);
	ttm_bo_cleanup_memtype_use(bo);

	ttm_bo_list_ref_sub(bo, put_count, true);

	return 0;
}

/**
 * Traverse the delayed list, and call ttm_bo_cleanup_refs on all
 * encountered buffers.
*/

/*
 * With remove_all set, a reservation that cannot be taken immediately is
 * retried in blocking mode and the cleanup is allowed to wait for the
 * GPU.  Returns 0 when the walk completed, or the first error from the
 * per-object cleanup.
 */
static int ttm_bo_delayed_delete(struct ttm_bo_device *bdev, bool remove_all)
{
	struct ttm_bo_global *glob = bdev->glob;
	struct ttm_buffer_object *entry = NULL;
	int ret = 0;

	mtx_lock(&glob->lru_lock);
	if (list_empty(&bdev->ddestroy))
		goto out_unlock;

	entry = list_first_entry(&bdev->ddestroy,
	    struct ttm_buffer_object, ddestroy);
	refcount_acquire(&entry->list_kref);

	for (;;) {
		struct ttm_buffer_object *nentry = NULL;

		/*
		 * Pin the successor before lru_lock is dropped so the
		 * walk can continue from it afterwards.
		 */
		if (entry->ddestroy.next != &bdev->ddestroy) {
			nentry = list_first_entry(&entry->ddestroy,
			    struct ttm_buffer_object, ddestroy);
			refcount_acquire(&nentry->list_kref);
		}

		ret = ttm_bo_reserve_nolru(entry, false, true, false, 0);
		if (remove_all && ret) {
			ret = ttm_bo_reserve_nolru(entry, false, false,
			    false, 0);
		}

		/* Either branch leaves lru_lock released. */
		if (!ret)
			ret = ttm_bo_cleanup_refs_and_unlock(entry, false,
			    !remove_all);
		else
			mtx_unlock(&glob->lru_lock);

		if (refcount_release(&entry->list_kref))
			ttm_bo_release_list(entry);
		entry = nentry;

		if (ret || !entry)
			goto out;

		mtx_lock(&glob->lru_lock);
		/* The successor left the list while we were unlocked. */
		if (list_empty(&entry->ddestroy))
			break;
	}

out_unlock:
	mtx_unlock(&glob->lru_lock);
out:
	if (entry && refcount_release(&entry->list_kref))
		ttm_bo_release_list(entry);
	return ret;
}

/*
 * Task handler: run one non-blocking pass over the delayed-destroy list
 * and re-arm the timeout task if that pass returned non-zero.
 */
static void ttm_bo_delayed_workqueue(void *arg, int pending __unused)
{
	struct ttm_bo_device *bdev = arg;

	if (ttm_bo_delayed_delete(bdev, false)) {
		taskqueue_enqueue_timeout(taskqueue_thread, &bdev->wq,
		    ((hz / 100) < 1) ? 1 : hz / 100);
	}
}

/*
 * Called when the last kref is dropped: remove the bo from the device
 * address space, free its io mapping, clean up (or queue) its resources
 * and drop the list reference held on behalf of the kref.
 */
static void ttm_bo_release(struct ttm_buffer_object *bo)
{
	struct ttm_bo_device *bdev = bo->bdev;
	struct ttm_mem_type_manager *man = &bdev->man[bo->mem.mem_type];

	rw_wlock(&bdev->vm_lock);
	if (likely(bo->vm_node != NULL)) {
		RB_REMOVE(ttm_bo_device_buffer_objects,
		    &bdev->addr_space_rb, bo);
		drm_mm_put_block(bo->vm_node);
		bo->vm_node = NULL;
	}
	rw_wunlock(&bdev->vm_lock);
	ttm_mem_io_lock(man, false);
	ttm_mem_io_free_vm(bo);
	ttm_mem_io_unlock(man);
	ttm_bo_cleanup_refs_or_queue(bo);
	if (refcount_release(&bo->list_kref))
		ttm_bo_release_list(bo);
}

/*
 * Drop one kref reference and clear the caller's pointer.  *p_bo must
 * not be NULL: it is dereferenced unconditionally.
 */
void ttm_bo_unref(struct ttm_buffer_object **p_bo)
{
	struct ttm_buffer_object *bo = *p_bo;

	*p_bo = NULL;
	if (refcount_release(&bo->kref))
		ttm_bo_release(bo);
}

/*
 * Cancel the delayed-delete task, draining it if the cancel reports
 * non-zero.  Returns the pending count reported by the cancel, which is
 * meant to be passed to ttm_bo_unlock_delayed_workqueue().
 */
int ttm_bo_lock_delayed_workqueue(struct ttm_bo_device *bdev)
{
	int pending;

	if (taskqueue_cancel_timeout(taskqueue_thread, &bdev->wq, &pending))
		taskqueue_drain_timeout(taskqueue_thread, &bdev->wq);
	return (pending);
}

/*
 * Re-arm the delayed-delete task when resched is non-zero.
 */
void ttm_bo_unlock_delayed_workqueue(struct ttm_bo_device *bdev, int resched)
{
	if (resched) {
		taskqueue_enqueue_timeout(taskqueue_thread, &bdev->wq,
		    ((hz / 100) < 1) ? 1 : hz / 100);
	}
}

/*
 * Evict a reserved bo: wait for it to become idle, ask the driver where
 * evicted buffers should go (evict_flags), find space there and move the
 * bo.  On success bo->evicted is set.  Returns 0 or a negative errno;
 * -ERESTARTSYS is passed through without logging.
 */
static int ttm_bo_evict(struct ttm_buffer_object *bo, bool interruptible,
    bool no_wait_gpu)
{
	struct ttm_bo_device *bdev = bo->bdev;
	struct ttm_mem_reg evict_mem;
	struct ttm_placement placement;
	int ret = 0;

	mtx_lock(&bdev->fence_lock);
	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
	mtx_unlock(&bdev->fence_lock);

	if (unlikely(ret != 0)) {
		if (ret != -ERESTARTSYS) {
			printf("[TTM] Failed to expire sync object before buffer eviction\n");
		}
		goto out;
	}

	MPASS(ttm_bo_is_reserved(bo));

	/* Start from the current region, minus node and io reservation. */
	evict_mem = bo->mem;
	evict_mem.mm_node = NULL;
	evict_mem.bus.io_reserved_vm = false;
	evict_mem.bus.io_reserved_count = 0;

	placement.fpfn = 0;
	placement.lpfn = 0;
	placement.num_placement = 0;
	placement.num_busy_placement = 0;
	bdev->driver->evict_flags(bo, &placement);
	ret = ttm_bo_mem_space(bo, &placement, &evict_mem, interruptible,
	    no_wait_gpu);
	if (ret) {
		if (ret != -ERESTARTSYS) {
			printf("[TTM] Failed to find memory space for buffer 0x%p eviction\n",
			    bo);
			ttm_bo_mem_space_debug(bo, &placement);
		}
		goto out;
	}

	ret = ttm_bo_handle_move_mem(bo, &evict_mem, true, interruptible,
	    no_wait_gpu);
	if (ret) {
		if (ret != -ERESTARTSYS)
			printf("[TTM] Buffer eviction failed\n");
		/* Give back the space that was found for the move. */
		ttm_bo_mem_put(bo, &evict_mem);
		goto out;
	}
	bo->evicted = true;
out:
	return ret;
}

/*
 * Evict the first bo on the LRU of mem_type that can be reserved without
 * waiting.  A bo that is already on the delayed-destroy list is cleaned
 * up instead of evicted.  Returns -EBUSY when the LRU is empty, the
 * reservation error when no entry could be reserved, or the result of
 * the cleanup / eviction.
 */
static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
    uint32_t mem_type,
    bool interruptible,
    bool no_wait_gpu)
{
	struct ttm_bo_global *glob = bdev->glob;
	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
	struct ttm_buffer_object *bo;
	int ret = -EBUSY, put_count;

	mtx_lock(&glob->lru_lock);
	list_for_each_entry(bo, &man->lru, lru) {
		ret = ttm_bo_reserve_nolru(bo, false, true, false, 0);
		if (!ret)
			break;
	}

	if (ret) {
		mtx_unlock(&glob->lru_lock);
		return ret;
	}

	/* Keep the bo alive across the unlocked section below. */
	refcount_acquire(&bo->list_kref);

	if (!list_empty(&bo->ddestroy)) {
		ret = ttm_bo_cleanup_refs_and_unlock(bo, interruptible,
		    no_wait_gpu);
		if (refcount_release(&bo->list_kref))
			ttm_bo_release_list(bo);
		return ret;
	}

	put_count = ttm_bo_del_from_lru(bo);
	mtx_unlock(&glob->lru_lock);

	MPASS(ret == 0);

	ttm_bo_list_ref_sub(bo, put_count, true);

	ret = ttm_bo_evict(bo, interruptible, no_wait_gpu);
	ttm_bo_unreserve(bo);

	if (refcount_release(&bo->list_kref))
		ttm_bo_release_list(bo);
	return ret;
}

/*
 * Return the node of mem (if it has one) to the manager of its memory
 * type through the manager's put_node hook.
 */
void ttm_bo_mem_put(struct ttm_buffer_object *bo, struct ttm_mem_reg *mem)
{
	struct ttm_mem_type_manager *man = &bo->bdev->man[mem->mem_type];

	if (mem->mm_node)
		(*man->func->put_node)(man, mem);
}

/**
 * Repeatedly evict memory from the LRU for @mem_type until we create enough
 * space, or we've evicted everything and there isn't enough space.
 */
static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
    uint32_t mem_type,
    struct ttm_placement *placement,
    struct ttm_mem_reg *mem,
    bool interruptible,
    bool no_wait_gpu)
{
	struct ttm_bo_device *bdev = bo->bdev;
	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
	int ret;

	do {
		ret = (*man->func->get_node)(man, bo, placement, mem);
		if (unlikely(ret != 0))
			return ret;
		if (mem->mm_node)
			break;
		ret = ttm_mem_evict_first(bdev, mem_type,
		    interruptible, no_wait_gpu);
		if (unlikely(ret != 0))
			return ret;
	} while (1);
	/*
	 * NOTE(review): the loop above is only left via 'break' with
	 * mm_node set, so this check looks unreachable -- confirm.
	 */
	if (mem->mm_node == NULL)
		return -ENOMEM;
	mem->mem_type = mem_type;
	return 0;
}

/*
 * Pick exactly one caching flag for proposed_placement and return the
 * placement with its caching bits replaced by that choice.  Preference
 * order: the current caching, the manager default, then CACHED, WC,
 * UNCACHED.
 */
static uint32_t ttm_bo_select_caching(struct ttm_mem_type_manager *man,
    uint32_t cur_placement,
    uint32_t proposed_placement)
{
	uint32_t caching = proposed_placement & TTM_PL_MASK_CACHING;
	uint32_t result = proposed_placement & ~TTM_PL_MASK_CACHING;

	/**
	 * Keep current caching if possible.
	 */

	if ((cur_placement & caching) != 0)
		result |= (cur_placement & caching);
	else if ((man->default_caching & caching) != 0)
		result |= man->default_caching;
	else if ((TTM_PL_FLAG_CACHED & caching) != 0)
		result |= TTM_PL_FLAG_CACHED;
	else if ((TTM_PL_FLAG_WC & caching) != 0)
		result |= TTM_PL_FLAG_WC;
	else if ((TTM_PL_FLAG_UNCACHED & caching) != 0)
		result |= TTM_PL_FLAG_UNCACHED;

	return result;
}

/*
 * Check whether proposed_placement can be satisfied by memory type
 * mem_type: it must name that type and share at least one bit with the
 * manager's available caching modes.  On success *masked_placement
 * receives the type bit plus the usable bits of proposed_placement.
 */
static bool ttm_bo_mt_compatible(struct ttm_mem_type_manager *man,
    uint32_t mem_type,
    uint32_t proposed_placement,
    uint32_t *masked_placement)
{
	uint32_t cur_flags = ttm_bo_type_flags(mem_type);

	if ((cur_flags & proposed_placement & TTM_PL_MASK_MEM) == 0)
		return false;

	if ((proposed_placement & man->available_caching) == 0)
		return false;

	cur_flags |= (proposed_placement & man->available_caching);

	*masked_placement = cur_flags;
	return true;
}

/**
 * Creates space for memory region @mem according to its type.
 *
 * This function first searches for free space in compatible memory types in
 * the priority order defined by the driver. If free space isn't found, then
 * ttm_bo_mem_force_space is attempted in priority order to evict and find
 * space.
*/
int ttm_bo_mem_space(struct ttm_buffer_object *bo,
    struct ttm_placement *placement,
    struct ttm_mem_reg *mem,
    bool interruptible,
    bool no_wait_gpu)
{
	struct ttm_bo_device *bdev = bo->bdev;
	struct ttm_mem_type_manager *man;
	uint32_t mem_type = TTM_PL_SYSTEM;
	uint32_t cur_flags = 0;
	bool type_found = false;
	bool type_ok = false;
	bool has_erestartsys = false;
	int i, ret;

	mem->mm_node = NULL;
	/* First pass: preferred placements, free space only. */
	for (i = 0; i < placement->num_placement; ++i) {
		ret = ttm_mem_type_from_flags(placement->placement[i],
		    &mem_type);
		if (ret)
			return ret;
		man = &bdev->man[mem_type];

		type_ok = ttm_bo_mt_compatible(man,
		    mem_type,
		    placement->placement[i],
		    &cur_flags);

		if (!type_ok)
			continue;

		cur_flags = ttm_bo_select_caching(man, bo->mem.placement,
		    cur_flags);
		/*
		 * Use the access and other non-mapping-related flag bits from
		 * the memory placement flags to the current flags
		 */
		ttm_flag_masked(&cur_flags, placement->placement[i],
		    ~TTM_PL_MASK_MEMTYPE);

		/* System memory needs no node from a manager. */
		if (mem_type == TTM_PL_SYSTEM)
			break;

		if (man->has_type && man->use_type) {
			type_found = true;
			ret = (*man->func->get_node)(man, bo, placement, mem);
			if (unlikely(ret))
				return ret;
		}
		if (mem->mm_node)
			break;
	}

	if ((type_ok && (mem_type == TTM_PL_SYSTEM)) || mem->mm_node) {
		mem->mem_type = mem_type;
		mem->placement = cur_flags;
		return 0;
	}

	if (!type_found)
		return -EINVAL;

	/* Second pass: busy placements, evicting other buffers if needed. */
	for (i = 0; i < placement->num_busy_placement; ++i) {
		ret = ttm_mem_type_from_flags(placement->busy_placement[i],
		    &mem_type);
		if (ret)
			return ret;
		man = &bdev->man[mem_type];
		if (!man->has_type)
			continue;
		if (!ttm_bo_mt_compatible(man,
		    mem_type,
		    placement->busy_placement[i],
		    &cur_flags))
			continue;

		cur_flags = ttm_bo_select_caching(man, bo->mem.placement,
		    cur_flags);
		/*
		 * Use the access and other non-mapping-related flag bits from
		 * the memory placement flags to the current flags
		 */
		ttm_flag_masked(&cur_flags, placement->busy_placement[i],
		    ~TTM_PL_MASK_MEMTYPE);

		if (mem_type == TTM_PL_SYSTEM) {
			mem->mem_type = mem_type;
			mem->placement = cur_flags;
			mem->mm_node = NULL;
			return 0;
		}

		ret = ttm_bo_mem_force_space(bo, mem_type, placement, mem,
		    interruptible, no_wait_gpu);
		if (ret == 0 && mem->mm_node) {
			mem->placement = cur_flags;
			return 0;
		}
		/* Remember an interrupted wait, but keep trying other types. */
		if (ret == -ERESTARTSYS)
			has_erestartsys = true;
	}
	ret = (has_erestartsys) ? -ERESTARTSYS : -ENOMEM;
	return ret;
}

/*
 * Move a reserved bo to a region satisfying placement: wait for the bo
 * to become idle, find space with ttm_bo_mem_space() and perform the
 * move.  A node obtained for a move that then failed is given back.
 */
static int ttm_bo_move_buffer(struct ttm_buffer_object *bo,
    struct ttm_placement *placement,
    bool interruptible,
    bool no_wait_gpu)
{
	int ret = 0;
	struct ttm_mem_reg mem;
	struct ttm_bo_device *bdev = bo->bdev;

	MPASS(ttm_bo_is_reserved(bo));

	/*
	 * FIXME: It's possible to pipeline buffer moves.
	 * Have the driver move function wait for idle when necessary,
	 * instead of doing it here.
	 */
	mtx_lock(&bdev->fence_lock);
	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
	mtx_unlock(&bdev->fence_lock);
	if (ret)
		return ret;
	mem.num_pages = bo->num_pages;
	mem.size = mem.num_pages << PAGE_SHIFT;
	mem.page_alignment = bo->mem.page_alignment;
	mem.bus.io_reserved_vm = false;
	mem.bus.io_reserved_count = 0;
	/*
	 * Determine where to move the buffer.
	 */
	ret = ttm_bo_mem_space(bo, placement, &mem,
	    interruptible, no_wait_gpu);
	if (ret)
		goto out_unlock;
	ret = ttm_bo_handle_move_mem(bo, &mem, false,
	    interruptible, no_wait_gpu);
out_unlock:
	if (ret && mem.mm_node)
		ttm_bo_mem_put(bo, &mem);
	return ret;
}

/*
 * Return the index of the first entry of placement->placement[] that
 * shares both a caching bit and a memory-type bit with mem->placement,
 * or -1 if there is none or if mem lies outside [fpfn, lpfn).
 */
static int ttm_bo_mem_compat(struct ttm_placement *placement,
    struct ttm_mem_reg *mem)
{
	int i;

	if (mem->mm_node && placement->lpfn != 0 &&
	    (mem->start < placement->fpfn ||
	     mem->start + mem->num_pages > placement->lpfn))
		return -1;

	for (i = 0; i < placement->num_placement; i++) {
		if ((placement->placement[i] & mem->placement &
		    TTM_PL_MASK_CACHING) &&
		    (placement->placement[i] & mem->placement &
		    TTM_PL_MASK_MEM))
			return i;
	}
	return -1;
}

/*
 * Make a reserved bo comply with placement, moving it only when its
 * current region is not compatible.  Returns 0, -EINVAL for an invalid
 * page range, or the error from the move / ttm creation.
 */
int ttm_bo_validate(struct ttm_buffer_object *bo,
    struct ttm_placement *placement,
    bool interruptible,
    bool no_wait_gpu)
{
	int ret;

	MPASS(ttm_bo_is_reserved(bo));
	/* Check that range is valid */
	if (placement->lpfn || placement->fpfn)
		if (placement->fpfn > placement->lpfn ||
		    (placement->lpfn - placement->fpfn) < bo->num_pages)
			return -EINVAL;
	/*
	 * Check whether we need to move buffer.
	 */
	ret = ttm_bo_mem_compat(placement, &bo->mem);
	if (ret < 0) {
		ret = ttm_bo_move_buffer(bo, placement, interruptible,
		    no_wait_gpu);
		if (ret)
			return ret;
	} else {
		/*
		 * Use the access and other non-mapping-related flag bits from
		 * the compatible memory placement flags to the active flags
		 */
		ttm_flag_masked(&bo->mem.placement, placement->placement[ret],
		    ~TTM_PL_MASK_MEMTYPE);
	}
	/*
	 * We might need to add a TTM.
	 */
	if (bo->mem.mem_type == TTM_PL_SYSTEM && bo->ttm == NULL) {
		ret = ttm_bo_add_ttm(bo, true);
		if (ret)
			return ret;
	}
	return 0;
}

/*
 * Assert that a restricted page range in placement is large enough for
 * the bo.  Always returns 0; a violation is only caught by MPASS.
 */
int ttm_bo_check_placement(struct ttm_buffer_object *bo,
    struct ttm_placement *placement)
{
	MPASS(!((placement->fpfn || placement->lpfn) &&
	    (bo->mem.num_pages > (placement->lpfn - placement->fpfn))));

	return 0;
}

/*
 * Initialise a caller-allocated buffer object and validate it against
 * placement.
 *
 * On every failure path the object is disposed of before returning:
 * through destroy (or free() with M_TTM_BO) for the early failures, and
 * through ttm_bo_unref() once the reference counts are set up.  Callers
 * must therefore not touch bo after a non-zero return.
 */
int ttm_bo_init(struct ttm_bo_device *bdev,
    struct ttm_buffer_object *bo,
    unsigned long size,
    enum ttm_bo_type type,
    struct ttm_placement *placement,
    uint32_t page_alignment,
    bool interruptible,
    struct vm_object *persistent_swap_storage,
    size_t acc_size,
    struct sg_table *sg,
    void (*destroy) (struct ttm_buffer_object *))
{
	int ret = 0;
	unsigned long num_pages;
	struct ttm_mem_global *mem_glob = bdev->glob->mem_glob;

	ret = ttm_mem_global_alloc(mem_glob, acc_size, false, false);
	if (ret) {
		printf("[TTM] Out of kernel memory\n");
		if (destroy)
			(*destroy)(bo);
		else
			free(bo, M_TTM_BO);
		return -ENOMEM;
	}

	/* Round the byte size up to whole pages. */
	num_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (num_pages == 0) {
		printf("[TTM] Illegal buffer object size\n");
		if (destroy)
			(*destroy)(bo);
		else
			free(bo, M_TTM_BO);
		ttm_mem_global_free(mem_glob, acc_size);
		return -EINVAL;
	}
	bo->destroy = destroy;

	refcount_init(&bo->kref, 1);
	refcount_init(&bo->list_kref, 1);
	atomic_set(&bo->cpu_writers, 0);
	/* The object starts out reserved; it is unreserved below. */
	atomic_set(&bo->reserved, 1);
	INIT_LIST_HEAD(&bo->lru);
	INIT_LIST_HEAD(&bo->ddestroy);
	INIT_LIST_HEAD(&bo->swap);
	INIT_LIST_HEAD(&bo->io_reserve_lru);
	bo->bdev = bdev;
	bo->glob = bdev->glob;
	bo->type = type;
	bo->num_pages = num_pages;
	bo->mem.size = num_pages << PAGE_SHIFT;
	bo->mem.mem_type = TTM_PL_SYSTEM;
	bo->mem.num_pages = bo->num_pages;
	bo->mem.mm_node = NULL;
	bo->mem.page_alignment = page_alignment;
	bo->mem.bus.io_reserved_vm = false;
	bo->mem.bus.io_reserved_count = 0;
	bo->priv_flags = 0;
	bo->mem.placement = (TTM_PL_FLAG_SYSTEM | TTM_PL_FLAG_CACHED);
	bo->seq_valid = false;
	bo->persistent_swap_storage = persistent_swap_storage;
	bo->acc_size = acc_size;
	bo->sg = sg;
	atomic_inc(&bo->glob->bo_count);

	ret = ttm_bo_check_placement(bo, placement);
	if (unlikely(ret != 0))
		goto out_err;

	/*
	 * For ttm_bo_type_device buffers, allocate
	 * address space from the device.
	 */
	if (bo->type == ttm_bo_type_device ||
	    bo->type == ttm_bo_type_sg) {
		ret = ttm_bo_setup_vm(bo);
		if (ret)
			goto out_err;
	}

	ret = ttm_bo_validate(bo, placement, interruptible, false);
	if (ret)
		goto out_err;

	ttm_bo_unreserve(bo);
	return 0;

out_err:
	ttm_bo_unreserve(bo);
	ttm_bo_unref(&bo);

	return ret;
}

/*
 * Accounting size of a buffer object of bo_size bytes: the (rounded)
 * object structure, a page-aligned array of one pointer per page, and
 * the (rounded) struct ttm_tt.  bdev is not used.
 */
size_t ttm_bo_acc_size(struct ttm_bo_device *bdev,
    unsigned long bo_size,
    unsigned struct_size)
{
	unsigned npages = (PAGE_ALIGN(bo_size)) >> PAGE_SHIFT;
	size_t size = 0;

	size += ttm_round_pot(struct_size);
	size += PAGE_ALIGN(npages * sizeof(void *));
	size += ttm_round_pot(sizeof(struct ttm_tt));
	return size;
}

/*
 * Like ttm_bo_acc_size(), with an additional page-aligned array of one
 * dma_addr_t per page and struct ttm_dma_tt instead of struct ttm_tt.
 */
size_t ttm_bo_dma_acc_size(struct ttm_bo_device *bdev,
    unsigned long bo_size,
    unsigned struct_size)
{
	unsigned npages = (PAGE_ALIGN(bo_size)) >> PAGE_SHIFT;
	size_t size = 0;

	size += ttm_round_pot(struct_size);
	size += PAGE_ALIGN(npages * sizeof(void *));
	size += PAGE_ALIGN(npages * sizeof(dma_addr_t));
	size += ttm_round_pot(sizeof(struct ttm_dma_tt));
	return size;
}

/*
 * Allocate (zeroed, M_WAITOK) and initialise a buffer object.  *p_bo is
 * written only on success; on failure ttm_bo_init() has already
 * disposed of the allocation.
 */
int ttm_bo_create(struct ttm_bo_device *bdev,
    unsigned long size,
    enum ttm_bo_type type,
    struct ttm_placement *placement,
    uint32_t page_alignment,
    bool interruptible,
    struct vm_object *persistent_swap_storage,
    struct ttm_buffer_object **p_bo)
{
	struct ttm_buffer_object *bo;
	size_t acc_size;
	int ret;

	bo = malloc(sizeof(*bo), M_TTM_BO, M_WAITOK | M_ZERO);
	acc_size = ttm_bo_acc_size(bdev, size, sizeof(struct ttm_buffer_object));
	ret = ttm_bo_init(bdev, bo, size, type, placement, page_alignment,
	    interruptible, persistent_swap_storage, acc_size,
	    NULL, NULL);
	if (likely(ret == 0))
		*p_bo = bo;

	return ret;
}

/*
 * Evict buffers from the LRU of mem_type until it is empty.  With
 * allow_errors set the first eviction error is returned; otherwise it
 * is only logged and the loop continues.
 */
static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
    unsigned mem_type, bool allow_errors)
{
	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
	struct ttm_bo_global *glob = bdev->glob;
	int ret;

	/*
	 * Can't use standard list traversal since we're unlocking.
	 */

	mtx_lock(&glob->lru_lock);
	while (!list_empty(&man->lru)) {
		mtx_unlock(&glob->lru_lock);
		ret = ttm_mem_evict_first(bdev, mem_type, false, false);
		if (ret) {
			if (allow_errors) {
				return ret;
			} else {
				printf("[TTM] Cleanup eviction failed\n");
			}
		}
		mtx_lock(&glob->lru_lock);
	}
	mtx_unlock(&glob->lru_lock);
	return 0;
}

/*
 * Take down the manager of mem_type: mark it unused, evict everything
 * on its LRU (errors are only logged) and call its takedown hook.  Type
 * 0 is only marked unused.  Returns -EINVAL for an out-of-range or
 * uninitialised type, else the takedown result.
 */
int ttm_bo_clean_mm(struct ttm_bo_device *bdev, unsigned mem_type)
{
	struct ttm_mem_type_manager *man;
	int ret = -EINVAL;

	if (mem_type >= TTM_NUM_MEM_TYPES) {
		/* NOTE(review): "%d" is used with an unsigned argument. */
		printf("[TTM] Illegal memory type %d\n", mem_type);
		return ret;
	}
	man = &bdev->man[mem_type];

	if (!man->has_type) {
		printf("[TTM] Trying to take down uninitialized memory manager type %u\n",
		    mem_type);
		return ret;
	}

	man->use_type = false;
	man->has_type = false;

	ret = 0;
	if (mem_type > 0) {
		ttm_bo_force_list_clean(bdev, mem_type, false);

		ret = (*man->func->takedown)(man);
	}

	return ret;
}

/*
 * Evict every buffer from mem_type.  Returns -EINVAL for type 0 or an
 * out-of-range type, 0 if the type was never initialised, else the
 * result of ttm_bo_force_list_clean() with errors allowed.
 */
int ttm_bo_evict_mm(struct ttm_bo_device *bdev, unsigned mem_type)
{
	/*
	 * NOTE(review): the element address is formed before mem_type is
	 * range-checked below; it is not dereferenced until after the check.
	 */
	struct ttm_mem_type_manager *man = &bdev->man[mem_type];

	if (mem_type == 0 || mem_type >= TTM_NUM_MEM_TYPES) {
		printf("[TTM] Illegal memory manager memory type %u\n", mem_type);
		return -EINVAL;
	}

	if (!man->has_type) {
		printf("[TTM] Memory type %u has not been initialized\n", mem_type);
		return 0;
	}

	return ttm_bo_force_list_clean(bdev, mem_type, true);
}

/*
 * Initialise the manager for memory type 'type' with p_size as its
 * size: set up the io-reserve state, let the driver fill in the manager
 * (init_mem_type), call the manager's init hook for every type except
 * TTM_PL_SYSTEM, then mark the type present and usable.
 */
int ttm_bo_init_mm(struct ttm_bo_device *bdev, unsigned type,
    unsigned long p_size)
{
	int ret = -EINVAL;
	struct ttm_mem_type_manager *man;

	MPASS(type < TTM_NUM_MEM_TYPES);
	man = &bdev->man[type];
	MPASS(!man->has_type);
	man->io_reserve_fastpath = true;
	man->use_io_reserve_lru = false;
	sx_init(&man->io_reserve_mutex, "ttmman");
	INIT_LIST_HEAD(&man->io_reserve_lru);

	ret = bdev->driver->init_mem_type(bdev, type, man);
	if (ret)
		return ret;
	man->bdev = bdev;

	ret = 0;
	if (type != TTM_PL_SYSTEM) {
		ret = (*man->func->init)(man, p_size);
		if (ret)
			return ret;
	}
	man->has_type = true;
	man->use_type = true;
	man->size = p_size;

	INIT_LIST_HEAD(&man->lru);

	return 0;
}

/*
 * Undo ttm_bo_global_init(): unregister the swapout shrinker and free
 * the dummy read page.
 */
static void ttm_bo_global_kobj_release(struct ttm_bo_global *glob)
{

	ttm_mem_unregister_shrink(glob->mem_glob, &glob->shrink);
	vm_page_free(glob->dummy_read_page);
}

/*
 * Drop one reference on the global state and release it when that was
 * the last one.
 */
void ttm_bo_global_release(struct drm_global_reference *ref)
{
	struct ttm_bo_global *glob = ref->object;

	if (refcount_release(&glob->kobj_ref))
		ttm_bo_global_kobj_release(glob);
}

/*
 * Initialise the global buffer-object state: locks, the uncacheable
 * dummy read page (with one reclaim-and-retry attempt), the swap and
 * device lists, and the swapout shrinker.  On failure glob is freed and
 * a negative errno is returned.
 *
 * NOTE(review): the error paths free glob without destroying the sx and
 * mutex initialised at the top -- confirm whether that is intended.
 */
int ttm_bo_global_init(struct drm_global_reference *ref)
{
	struct ttm_bo_global_ref *bo_ref =
	    container_of(ref, struct ttm_bo_global_ref, ref);
	struct ttm_bo_global *glob = ref->object;
-	int ret;
+	int req, ret;
	int tries;

	sx_init(&glob->device_list_mutex, "ttmdlm");
	mtx_init(&glob->lru_lock, "ttmlru", NULL, MTX_DEF);
	glob->mem_glob = bo_ref->mem_glob;
+	req = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ;
	tries = 0;
retry:
-	glob->dummy_read_page = vm_page_alloc_contig(NULL, 0,
-	    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ,
+	glob->dummy_read_page = vm_page_alloc_contig(NULL, 0, req,
	    1, 0, VM_MAX_ADDRESS, PAGE_SIZE, 0, VM_MEMATTR_UNCACHEABLE);

	if (unlikely(glob->dummy_read_page == NULL)) {
-		if (tries < 1) {
-			vm_pageout_grow_cache(tries, 0, VM_MAX_ADDRESS);
+		if (tries < 1 && vm_page_reclaim_contig(req, 1,
+		    0, VM_MAX_ADDRESS, PAGE_SIZE, 0)) {
			tries++;
			goto retry;
		}
		ret = -ENOMEM;
		goto out_no_drp;
	}

	INIT_LIST_HEAD(&glob->swap_lru);
	INIT_LIST_HEAD(&glob->device_list);

	ttm_mem_init_shrink(&glob->shrink, ttm_bo_swapout);
	ret = ttm_mem_register_shrink(glob->mem_glob, &glob->shrink);
	if (unlikely(ret != 0)) {
		printf("[TTM] Could not register buffer object swapout\n");
		goto out_no_shrink;
	}

	atomic_set(&glob->bo_count, 0);

	refcount_init(&glob->kobj_ref, 1);
	return (0);

out_no_shrink:
	vm_page_free(glob->dummy_read_page);
out_no_drp:
	free(glob, M_DRM_GLOBAL);
	return ret;
}

/*
 * Tear down a device: clean every initialised non-system memory type,
 * unlink the device from the global list, stop the delayed-delete task,
 * flush the delayed-destroy list and take down the address space
 * manager.  Returns -EBUSY if any memory type failed to clean, else 0.
 */
int ttm_bo_device_release(struct ttm_bo_device *bdev)
{
	int ret = 0;
	unsigned i = TTM_NUM_MEM_TYPES;
	struct ttm_mem_type_manager *man;
	struct ttm_bo_global *glob = bdev->glob;

	/* Walk the memory types from the highest index down to 0. */
	while (i--) {
		man = &bdev->man[i];
		if (man->has_type) {
			man->use_type = false;
			if ((i != TTM_PL_SYSTEM) && ttm_bo_clean_mm(bdev, i)) {
				ret = -EBUSY;
				printf("[TTM] DRM memory manager type %d is not clean\n",
				    i);
			}
			man->has_type = false;
		}
	}

	sx_xlock(&glob->device_list_mutex);
	list_del(&bdev->device_list);
	sx_xunlock(&glob->device_list_mutex);

	if (taskqueue_cancel_timeout(taskqueue_thread, &bdev->wq, NULL))
		taskqueue_drain_timeout(taskqueue_thread, &bdev->wq);

	/* Repeat blocking passes until one returns 0. */
	while (ttm_bo_delayed_delete(bdev, true))
		;

	mtx_lock(&glob->lru_lock);
	if (list_empty(&bdev->ddestroy))
		TTM_DEBUG("Delayed destroy list was clean\n");

	if (list_empty(&bdev->man[0].lru))
		TTM_DEBUG("Swap list was clean\n");
	mtx_unlock(&glob->lru_lock);

	MPASS(drm_mm_clean(&bdev->addr_space_mm));
	rw_wlock(&bdev->vm_lock);
	drm_mm_takedown(&bdev->addr_space_mm);
	rw_wunlock(&bdev->vm_lock);

	return ret;
}

/*
 * Initialise a device: the system memory type, the address space
 * manager (starting at file_page_offset), the delayed-delete timeout
 * task and the fence lock; finally add the device to the global list.
 */
int ttm_bo_device_init(struct ttm_bo_device *bdev,
    struct ttm_bo_global *glob,
    struct ttm_bo_driver *driver,
    uint64_t file_page_offset,
    bool need_dma32)
{
	int ret = -EINVAL;

	rw_init(&bdev->vm_lock, "ttmvml");
	bdev->driver = driver;

	memset(bdev->man, 0, sizeof(bdev->man));

	/*
	 * Initialize the system memory buffer type.
	 * Other types need to be driver / IOCTL initialized.
	 */
	ret = ttm_bo_init_mm(bdev, TTM_PL_SYSTEM, 0);
	if (unlikely(ret != 0))
		goto out_no_sys;

	RB_INIT(&bdev->addr_space_rb);
	ret = drm_mm_init(&bdev->addr_space_mm, file_page_offset, 0x10000000);
	if (unlikely(ret != 0))
		goto out_no_addr_mm;

	TIMEOUT_TASK_INIT(taskqueue_thread, &bdev->wq, 0,
	    ttm_bo_delayed_workqueue, bdev);
	INIT_LIST_HEAD(&bdev->ddestroy);
	bdev->dev_mapping = NULL;
	bdev->glob = glob;
	bdev->need_dma32 = need_dma32;
	bdev->val_seq = 0;
	mtx_init(&bdev->fence_lock, "ttmfence", NULL, MTX_DEF);
	sx_xlock(&glob->device_list_mutex);
	list_add_tail(&bdev->device_list, &glob->device_list);
	sx_xunlock(&glob->device_list_mutex);

	return 0;
out_no_addr_mm:
	ttm_bo_clean_mm(bdev, 0);
out_no_sys:
	return ret;
}

/*
 * buffer object vm functions.
*/ bool ttm_mem_reg_is_pci(struct ttm_bo_device *bdev, struct ttm_mem_reg *mem) { struct ttm_mem_type_manager *man = &bdev->man[mem->mem_type]; if (!(man->flags & TTM_MEMTYPE_FLAG_FIXED)) { if (mem->mem_type == TTM_PL_SYSTEM) return false; if (man->flags & TTM_MEMTYPE_FLAG_CMA) return false; if (mem->placement & TTM_PL_FLAG_CACHED) return false; } return true; } void ttm_bo_unmap_virtual_locked(struct ttm_buffer_object *bo) { ttm_bo_release_mmap(bo); ttm_mem_io_free_vm(bo); } void ttm_bo_unmap_virtual(struct ttm_buffer_object *bo) { struct ttm_bo_device *bdev = bo->bdev; struct ttm_mem_type_manager *man = &bdev->man[bo->mem.mem_type]; ttm_mem_io_lock(man, false); ttm_bo_unmap_virtual_locked(bo); ttm_mem_io_unlock(man); } static void ttm_bo_vm_insert_rb(struct ttm_buffer_object *bo) { struct ttm_bo_device *bdev = bo->bdev; /* The caller acquired bdev->vm_lock. */ RB_INSERT(ttm_bo_device_buffer_objects, &bdev->addr_space_rb, bo); } /** * ttm_bo_setup_vm: * * @bo: the buffer to allocate address space for * * Allocate address space in the drm device so that applications * can mmap the buffer and access the contents. This only * applies to ttm_bo_type_device objects as others are not * placed in the drm device address space. 
*/ static int ttm_bo_setup_vm(struct ttm_buffer_object *bo) { struct ttm_bo_device *bdev = bo->bdev; int ret; retry_pre_get: ret = drm_mm_pre_get(&bdev->addr_space_mm); if (unlikely(ret != 0)) return ret; rw_wlock(&bdev->vm_lock); bo->vm_node = drm_mm_search_free(&bdev->addr_space_mm, bo->mem.num_pages, 0, 0); if (unlikely(bo->vm_node == NULL)) { ret = -ENOMEM; goto out_unlock; } bo->vm_node = drm_mm_get_block_atomic(bo->vm_node, bo->mem.num_pages, 0); if (unlikely(bo->vm_node == NULL)) { rw_wunlock(&bdev->vm_lock); goto retry_pre_get; } ttm_bo_vm_insert_rb(bo); rw_wunlock(&bdev->vm_lock); bo->addr_space_offset = ((uint64_t) bo->vm_node->start) << PAGE_SHIFT; return 0; out_unlock: rw_wunlock(&bdev->vm_lock); return ret; } int ttm_bo_wait(struct ttm_buffer_object *bo, bool lazy, bool interruptible, bool no_wait) { struct ttm_bo_driver *driver = bo->bdev->driver; struct ttm_bo_device *bdev = bo->bdev; void *sync_obj; int ret = 0; if (likely(bo->sync_obj == NULL)) return 0; while (bo->sync_obj) { if (driver->sync_obj_signaled(bo->sync_obj)) { void *tmp_obj = bo->sync_obj; bo->sync_obj = NULL; clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags); mtx_unlock(&bdev->fence_lock); driver->sync_obj_unref(&tmp_obj); mtx_lock(&bdev->fence_lock); continue; } if (no_wait) return -EBUSY; sync_obj = driver->sync_obj_ref(bo->sync_obj); mtx_unlock(&bdev->fence_lock); ret = driver->sync_obj_wait(sync_obj, lazy, interruptible); if (unlikely(ret != 0)) { driver->sync_obj_unref(&sync_obj); mtx_lock(&bdev->fence_lock); return ret; } mtx_lock(&bdev->fence_lock); if (likely(bo->sync_obj == sync_obj)) { void *tmp_obj = bo->sync_obj; bo->sync_obj = NULL; clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags); mtx_unlock(&bdev->fence_lock); driver->sync_obj_unref(&sync_obj); driver->sync_obj_unref(&tmp_obj); mtx_lock(&bdev->fence_lock); } else { mtx_unlock(&bdev->fence_lock); driver->sync_obj_unref(&sync_obj); mtx_lock(&bdev->fence_lock); } } return 0; } int ttm_bo_synccpu_write_grab(struct 
ttm_buffer_object *bo, bool no_wait) { struct ttm_bo_device *bdev = bo->bdev; int ret = 0; /* * Using ttm_bo_reserve makes sure the lru lists are updated. */ ret = ttm_bo_reserve(bo, true, no_wait, false, 0); if (unlikely(ret != 0)) return ret; mtx_lock(&bdev->fence_lock); ret = ttm_bo_wait(bo, false, true, no_wait); mtx_unlock(&bdev->fence_lock); if (likely(ret == 0)) atomic_inc(&bo->cpu_writers); ttm_bo_unreserve(bo); return ret; } void ttm_bo_synccpu_write_release(struct ttm_buffer_object *bo) { atomic_dec(&bo->cpu_writers); } /** * A buffer object shrink method that tries to swap out the first * buffer object on the bo_global::swap_lru list. */ static int ttm_bo_swapout(struct ttm_mem_shrink *shrink) { struct ttm_bo_global *glob = container_of(shrink, struct ttm_bo_global, shrink); struct ttm_buffer_object *bo; int ret = -EBUSY; int put_count; uint32_t swap_placement = (TTM_PL_FLAG_CACHED | TTM_PL_FLAG_SYSTEM); mtx_lock(&glob->lru_lock); list_for_each_entry(bo, &glob->swap_lru, swap) { ret = ttm_bo_reserve_nolru(bo, false, true, false, 0); if (!ret) break; } if (ret) { mtx_unlock(&glob->lru_lock); return ret; } refcount_acquire(&bo->list_kref); if (!list_empty(&bo->ddestroy)) { ret = ttm_bo_cleanup_refs_and_unlock(bo, false, false); if (refcount_release(&bo->list_kref)) ttm_bo_release_list(bo); return ret; } put_count = ttm_bo_del_from_lru(bo); mtx_unlock(&glob->lru_lock); ttm_bo_list_ref_sub(bo, put_count, true); /** * Wait for GPU, then move to system cached. 
*/ mtx_lock(&bo->bdev->fence_lock); ret = ttm_bo_wait(bo, false, false, false); mtx_unlock(&bo->bdev->fence_lock); if (unlikely(ret != 0)) goto out; if ((bo->mem.placement & swap_placement) != swap_placement) { struct ttm_mem_reg evict_mem; evict_mem = bo->mem; evict_mem.mm_node = NULL; evict_mem.placement = TTM_PL_FLAG_SYSTEM | TTM_PL_FLAG_CACHED; evict_mem.mem_type = TTM_PL_SYSTEM; ret = ttm_bo_handle_move_mem(bo, &evict_mem, true, false, false); if (unlikely(ret != 0)) goto out; } ttm_bo_unmap_virtual(bo); /** * Swap out. Buffer will be swapped in again as soon as * anyone tries to access a ttm page. */ if (bo->bdev->driver->swap_notify) bo->bdev->driver->swap_notify(bo); ret = ttm_tt_swapout(bo->ttm, bo->persistent_swap_storage); out: /** * * Unreserve without putting on LRU to avoid swapping out an * already swapped buffer. */ atomic_set(&bo->reserved, 0); wakeup(bo); if (refcount_release(&bo->list_kref)) ttm_bo_release_list(bo); return ret; } void ttm_bo_swapout_all(struct ttm_bo_device *bdev) { while (ttm_bo_swapout(&bdev->glob->shrink) == 0) ; } Index: head/sys/dev/drm2/ttm/ttm_page_alloc.c =================================================================== --- head/sys/dev/drm2/ttm/ttm_page_alloc.c (revision 292468) +++ head/sys/dev/drm2/ttm/ttm_page_alloc.c (revision 292469) @@ -1,929 +1,925 @@ /* * Copyright (c) Red Hat Inc. * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sub license, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Authors: Dave Airlie * Jerome Glisse * Pauli Nieminen */ /* * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. */ /* simple list based uncached page pool * - Pool collects recently freed pages for reuse * - Use page->lru to keep a free list * - doesn't track currently in use pages */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #define NUM_PAGES_TO_ALLOC (PAGE_SIZE/sizeof(vm_page_t)) #define SMALL_ALLOCATION 16 #define FREE_ALL_PAGES (~0U) /* times are in msecs */ #define PAGE_FREE_INTERVAL 1000 /** * struct ttm_page_pool - Pool to reuse recently allocated uc/wc pages. * * @lock: Protects the shared pool from concurrent access. Must be used with * irqsave/irqrestore variants because pool allocator may be called from * delayed work. * @fill_lock: Prevent concurrent calls to fill. * @list: Pool of free uc/wc pages for fast reuse. * @ttm_page_alloc_flags: Flags to pass for alloc_page. * @npages: Number of pages in pool. */ struct ttm_page_pool { struct mtx lock; bool fill_lock; bool dma32; struct pglist list; int ttm_page_alloc_flags; unsigned npages; char *name; unsigned long nfrees; unsigned long nrefills; }; /** * Limits for the pool. They are handled without locks because only place where * they may change is in sysfs store. They won't have immediate effect anyway * so forcing serialization to access them is pointless.
*/ struct ttm_pool_opts { unsigned alloc_size; unsigned max_size; unsigned small; }; #define NUM_POOLS 4 /** * struct ttm_pool_manager - Holds memory pools for fst allocation * * Manager is read only object for pool code so it doesn't need locking. * * @free_interval: minimum number of jiffies between freeing pages from pool. * @page_alloc_inited: reference counting for pool allocation. * @work: Work that is used to shrink the pool. Work is only run when there is * some pages to free. * @small_allocation: Limit in number of pages what is small allocation. * * @pools: All pool objects in use. **/ struct ttm_pool_manager { unsigned int kobj_ref; eventhandler_tag lowmem_handler; struct ttm_pool_opts options; union { struct ttm_page_pool u_pools[NUM_POOLS]; struct _utag { struct ttm_page_pool u_wc_pool; struct ttm_page_pool u_uc_pool; struct ttm_page_pool u_wc_pool_dma32; struct ttm_page_pool u_uc_pool_dma32; } _ut; } _u; }; #define pools _u.u_pools #define wc_pool _u._ut.u_wc_pool #define uc_pool _u._ut.u_uc_pool #define wc_pool_dma32 _u._ut.u_wc_pool_dma32 #define uc_pool_dma32 _u._ut.u_uc_pool_dma32 MALLOC_DEFINE(M_TTM_POOLMGR, "ttm_poolmgr", "TTM Pool Manager"); static void ttm_vm_page_free(vm_page_t m) { KASSERT(m->object == NULL, ("ttm page %p is owned", m)); KASSERT(m->wire_count == 1, ("ttm lost wire %p", m)); KASSERT((m->flags & PG_FICTITIOUS) != 0, ("ttm lost fictitious %p", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("ttm got unmanaged %p", m)); m->flags &= ~PG_FICTITIOUS; m->oflags |= VPO_UNMANAGED; vm_page_unwire(m, PQ_INACTIVE); vm_page_free(m); } static vm_memattr_t ttm_caching_state_to_vm(enum ttm_caching_state cstate) { switch (cstate) { case tt_uncached: return (VM_MEMATTR_UNCACHEABLE); case tt_wc: return (VM_MEMATTR_WRITE_COMBINING); case tt_cached: return (VM_MEMATTR_WRITE_BACK); } panic("caching state %d\n", cstate); } static vm_page_t ttm_vm_page_alloc_dma32(int req, vm_memattr_t memattr) { vm_page_t p; int tries; for (tries = 0; ; tries++) { 
p = vm_page_alloc_contig(NULL, 0, req, 1, 0, 0xffffffff, PAGE_SIZE, 0, memattr); if (p != NULL || tries > 2) return (p); - - /* - * Before growing the cache see if this is just a normal - * memory shortage. - */ - VM_WAIT; - vm_pageout_grow_cache(tries, 0, 0xffffffff); + if (!vm_page_reclaim_contig(req, 1, 0, 0xffffffff, + PAGE_SIZE, 0)) + VM_WAIT; } } static vm_page_t ttm_vm_page_alloc_any(int req, vm_memattr_t memattr) { vm_page_t p; while (1) { p = vm_page_alloc(NULL, 0, req); if (p != NULL) break; VM_WAIT; } pmap_page_set_memattr(p, memattr); return (p); } static vm_page_t ttm_vm_page_alloc(int flags, enum ttm_caching_state cstate) { vm_page_t p; vm_memattr_t memattr; int req; memattr = ttm_caching_state_to_vm(cstate); req = VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ; if ((flags & TTM_PAGE_FLAG_ZERO_ALLOC) != 0) req |= VM_ALLOC_ZERO; if ((flags & TTM_PAGE_FLAG_DMA32) != 0) p = ttm_vm_page_alloc_dma32(req, memattr); else p = ttm_vm_page_alloc_any(req, memattr); if (p != NULL) { p->oflags &= ~VPO_UNMANAGED; p->flags |= PG_FICTITIOUS; } return (p); } static void ttm_pool_kobj_release(struct ttm_pool_manager *m) { free(m, M_TTM_POOLMGR); } #if 0 /* XXXKIB sysctl */ static ssize_t ttm_pool_store(struct ttm_pool_manager *m, struct attribute *attr, const char *buffer, size_t size) { int chars; unsigned val; chars = sscanf(buffer, "%u", &val); if (chars == 0) return size; /* Convert kb to number of pages */ val = val / (PAGE_SIZE >> 10); if (attr == &ttm_page_pool_max) m->options.max_size = val; else if (attr == &ttm_page_pool_small) m->options.small = val; else if (attr == &ttm_page_pool_alloc_size) { if (val > NUM_PAGES_TO_ALLOC*8) { pr_err("Setting allocation size to %lu is not allowed. 
Recommended size is %lu\n", NUM_PAGES_TO_ALLOC*(PAGE_SIZE >> 7), NUM_PAGES_TO_ALLOC*(PAGE_SIZE >> 10)); return size; } else if (val > NUM_PAGES_TO_ALLOC) { pr_warn("Setting allocation size to larger than %lu is not recommended\n", NUM_PAGES_TO_ALLOC*(PAGE_SIZE >> 10)); } m->options.alloc_size = val; } return size; } static ssize_t ttm_pool_show(struct ttm_pool_manager *m, struct attribute *attr, char *buffer) { unsigned val = 0; if (attr == &ttm_page_pool_max) val = m->options.max_size; else if (attr == &ttm_page_pool_small) val = m->options.small; else if (attr == &ttm_page_pool_alloc_size) val = m->options.alloc_size; val = val * (PAGE_SIZE >> 10); return snprintf(buffer, PAGE_SIZE, "%u\n", val); } #endif static struct ttm_pool_manager *_manager; static int set_pages_array_wb(vm_page_t *pages, int addrinarray) { #ifdef TTM_HAS_AGP int i; for (i = 0; i < addrinarray; i++) pmap_page_set_memattr(pages[i], VM_MEMATTR_WRITE_BACK); #endif return 0; } static int set_pages_array_wc(vm_page_t *pages, int addrinarray) { #ifdef TTM_HAS_AGP int i; for (i = 0; i < addrinarray; i++) pmap_page_set_memattr(pages[i], VM_MEMATTR_WRITE_COMBINING); #endif return 0; } static int set_pages_array_uc(vm_page_t *pages, int addrinarray) { #ifdef TTM_HAS_AGP int i; for (i = 0; i < addrinarray; i++) pmap_page_set_memattr(pages[i], VM_MEMATTR_UNCACHEABLE); #endif return 0; } /** * Select the right pool or requested caching state and ttm flags. */ static struct ttm_page_pool *ttm_get_pool(int flags, enum ttm_caching_state cstate) { int pool_index; if (cstate == tt_cached) return NULL; if (cstate == tt_wc) pool_index = 0x0; else pool_index = 0x1; if (flags & TTM_PAGE_FLAG_DMA32) pool_index |= 0x2; return &_manager->pools[pool_index]; } /* set memory back to wb and free the pages. */ static void ttm_pages_put(vm_page_t *pages, unsigned npages) { unsigned i; /* Our VM handles vm memattr automatically on the page free. 
*/ if (set_pages_array_wb(pages, npages)) printf("[TTM] Failed to set %d pages to wb!\n", npages); for (i = 0; i < npages; ++i) ttm_vm_page_free(pages[i]); } static void ttm_pool_update_free_locked(struct ttm_page_pool *pool, unsigned freed_pages) { pool->npages -= freed_pages; pool->nfrees += freed_pages; } /** * Free pages from pool. * * To prevent hogging the ttm_swap process we only free NUM_PAGES_TO_ALLOC * number of pages in one go. * * @pool: to free the pages from * @free_all: If set to true will free all pages in pool **/ static int ttm_page_pool_free(struct ttm_page_pool *pool, unsigned nr_free) { vm_page_t p, p1; vm_page_t *pages_to_free; unsigned freed_pages = 0, npages_to_free = nr_free; unsigned i; if (NUM_PAGES_TO_ALLOC < nr_free) npages_to_free = NUM_PAGES_TO_ALLOC; pages_to_free = malloc(npages_to_free * sizeof(vm_page_t), M_TEMP, M_WAITOK | M_ZERO); restart: mtx_lock(&pool->lock); TAILQ_FOREACH_REVERSE_SAFE(p, &pool->list, pglist, plinks.q, p1) { if (freed_pages >= npages_to_free) break; pages_to_free[freed_pages++] = p; /* We can only remove NUM_PAGES_TO_ALLOC at a time. */ if (freed_pages >= NUM_PAGES_TO_ALLOC) { /* remove range of pages from the pool */ for (i = 0; i < freed_pages; i++) TAILQ_REMOVE(&pool->list, pages_to_free[i], plinks.q); ttm_pool_update_free_locked(pool, freed_pages); /** * Because changing page caching is costly * we unlock the pool to prevent stalling. */ mtx_unlock(&pool->lock); ttm_pages_put(pages_to_free, freed_pages); if (likely(nr_free != FREE_ALL_PAGES)) nr_free -= freed_pages; if (NUM_PAGES_TO_ALLOC >= nr_free) npages_to_free = nr_free; else npages_to_free = NUM_PAGES_TO_ALLOC; freed_pages = 0; /* free all so restart the processing */ if (nr_free) goto restart; /* Not allowed to fall through or break because * following context is inside spinlock while we are * outside here. 
*/ goto out; } } /* remove range of pages from the pool */ if (freed_pages) { for (i = 0; i < freed_pages; i++) TAILQ_REMOVE(&pool->list, pages_to_free[i], plinks.q); ttm_pool_update_free_locked(pool, freed_pages); nr_free -= freed_pages; } mtx_unlock(&pool->lock); if (freed_pages) ttm_pages_put(pages_to_free, freed_pages); out: free(pages_to_free, M_TEMP); return nr_free; } /* Get good estimation how many pages are free in pools */ static int ttm_pool_get_num_unused_pages(void) { unsigned i; int total = 0; for (i = 0; i < NUM_POOLS; ++i) total += _manager->pools[i].npages; return total; } /** * Callback for mm to request pool to reduce number of page held. */ static int ttm_pool_mm_shrink(void *arg) { static unsigned int start_pool = 0; unsigned i; unsigned pool_offset = atomic_fetchadd_int(&start_pool, 1); struct ttm_page_pool *pool; int shrink_pages = 100; /* XXXKIB */ pool_offset = pool_offset % NUM_POOLS; /* select start pool in round robin fashion */ for (i = 0; i < NUM_POOLS; ++i) { unsigned nr_free = shrink_pages; if (shrink_pages == 0) break; pool = &_manager->pools[(i + pool_offset)%NUM_POOLS]; shrink_pages = ttm_page_pool_free(pool, nr_free); } /* return estimated number of unused pages in pool */ return ttm_pool_get_num_unused_pages(); } static void ttm_pool_mm_shrink_init(struct ttm_pool_manager *manager) { manager->lowmem_handler = EVENTHANDLER_REGISTER(vm_lowmem, ttm_pool_mm_shrink, manager, EVENTHANDLER_PRI_ANY); } static void ttm_pool_mm_shrink_fini(struct ttm_pool_manager *manager) { EVENTHANDLER_DEREGISTER(vm_lowmem, manager->lowmem_handler); } static int ttm_set_pages_caching(vm_page_t *pages, enum ttm_caching_state cstate, unsigned cpages) { int r = 0; /* Set page caching */ switch (cstate) { case tt_uncached: r = set_pages_array_uc(pages, cpages); if (r) printf("[TTM] Failed to set %d pages to uc!\n", cpages); break; case tt_wc: r = set_pages_array_wc(pages, cpages); if (r) printf("[TTM] Failed to set %d pages to wc!\n", cpages); break; 
default: break; } return r; } /** * Free pages the pages that failed to change the caching state. If there is * any pages that have changed their caching state already put them to the * pool. */ static void ttm_handle_caching_state_failure(struct pglist *pages, int ttm_flags, enum ttm_caching_state cstate, vm_page_t *failed_pages, unsigned cpages) { unsigned i; /* Failed pages have to be freed */ for (i = 0; i < cpages; ++i) { TAILQ_REMOVE(pages, failed_pages[i], plinks.q); ttm_vm_page_free(failed_pages[i]); } } /** * Allocate new pages with correct caching. * * This function is reentrant if caller updates count depending on number of * pages returned in pages array. */ static int ttm_alloc_new_pages(struct pglist *pages, int ttm_alloc_flags, int ttm_flags, enum ttm_caching_state cstate, unsigned count) { vm_page_t *caching_array; vm_page_t p; int r = 0; unsigned i, cpages; unsigned max_cpages = min(count, (unsigned)(PAGE_SIZE/sizeof(vm_page_t))); /* allocate array for page caching change */ caching_array = malloc(max_cpages * sizeof(vm_page_t), M_TEMP, M_WAITOK | M_ZERO); for (i = 0, cpages = 0; i < count; ++i) { p = ttm_vm_page_alloc(ttm_alloc_flags, cstate); if (!p) { printf("[TTM] Unable to get page %u\n", i); /* store already allocated pages in the pool after * setting the caching state */ if (cpages) { r = ttm_set_pages_caching(caching_array, cstate, cpages); if (r) ttm_handle_caching_state_failure(pages, ttm_flags, cstate, caching_array, cpages); } r = -ENOMEM; goto out; } #ifdef CONFIG_HIGHMEM /* KIB: nop */ /* gfp flags of highmem page should never be dma32 so we * we should be fine in such case */ if (!PageHighMem(p)) #endif { caching_array[cpages++] = p; if (cpages == max_cpages) { r = ttm_set_pages_caching(caching_array, cstate, cpages); if (r) { ttm_handle_caching_state_failure(pages, ttm_flags, cstate, caching_array, cpages); goto out; } cpages = 0; } } TAILQ_INSERT_HEAD(pages, p, plinks.q); } if (cpages) { r = ttm_set_pages_caching(caching_array, 
cstate, cpages); if (r) ttm_handle_caching_state_failure(pages, ttm_flags, cstate, caching_array, cpages); } out: free(caching_array, M_TEMP); return r; } /** * Fill the given pool if there aren't enough pages and the requested number of * pages is small. */ static void ttm_page_pool_fill_locked(struct ttm_page_pool *pool, int ttm_flags, enum ttm_caching_state cstate, unsigned count) { vm_page_t p; int r; unsigned cpages = 0; /** * Only allow one pool fill operation at a time. * If pool doesn't have enough pages for the allocation new pages are * allocated from outside of pool. */ if (pool->fill_lock) return; pool->fill_lock = true; /* If allocation request is small and there are not enough * pages in a pool we fill the pool up first. */ if (count < _manager->options.small && count > pool->npages) { struct pglist new_pages; unsigned alloc_size = _manager->options.alloc_size; /** * Can't change page caching if in irqsave context. We have to * drop the pool->lock. */ mtx_unlock(&pool->lock); TAILQ_INIT(&new_pages); r = ttm_alloc_new_pages(&new_pages, pool->ttm_page_alloc_flags, ttm_flags, cstate, alloc_size); mtx_lock(&pool->lock); if (!r) { TAILQ_CONCAT(&pool->list, &new_pages, plinks.q); ++pool->nrefills; pool->npages += alloc_size; } else { printf("[TTM] Failed to fill pool (%p)\n", pool); /* If we have any pages left put them to the pool. */ TAILQ_FOREACH(p, &pool->list, plinks.q) { ++cpages; } TAILQ_CONCAT(&pool->list, &new_pages, plinks.q); pool->npages += cpages; } } pool->fill_lock = false; } /** * Cut 'count' number of pages from the pool and put them on the return list. * * @return count of pages still required to fulfill the request. 
*/
static unsigned ttm_page_pool_get_pages(struct ttm_page_pool *pool,
    struct pglist *pages, int ttm_flags, enum ttm_caching_state cstate,
    unsigned count)
{
	vm_page_t page;
	unsigned moved;
	unsigned remaining;

	mtx_lock(&pool->lock);
	ttm_page_pool_fill_locked(pool, ttm_flags, cstate, count);

	if (count < pool->npages) {
		/* The pool covers the request: move 'count' pages over. */
		for (moved = 0; moved < count; moved++) {
			page = TAILQ_FIRST(&pool->list);
			TAILQ_REMOVE(&pool->list, page, plinks.q);
			TAILQ_INSERT_TAIL(pages, page, plinks.q);
		}
		pool->npages -= count;
		remaining = 0;
	} else {
		/* Hand over the whole pool and report the shortfall. */
		TAILQ_CONCAT(pages, &pool->list, plinks.q);
		remaining = count - pool->npages;
		pool->npages = 0;
	}

	mtx_unlock(&pool->lock);
	return remaining;
}

/*
 * Return the non-NULL entries of 'pages' to the pool selected by flags and
 * cstate, clearing each returned slot. Without a matching pool the pages
 * are freed directly.
 */
static void ttm_put_pages(vm_page_t *pages, unsigned npages, int flags,
    enum ttm_caching_state cstate)
{
	struct ttm_page_pool *pool = ttm_get_pool(flags, cstate);
	unsigned idx;

	if (pool == NULL) {
		/* No pool for this memory type so free the pages */
		for (idx = 0; idx < npages; idx++) {
			if (!pages[idx])
				continue;
			ttm_vm_page_free(pages[idx]);
			pages[idx] = NULL;
		}
		return;
	}

	mtx_lock(&pool->lock);
	for (idx = 0; idx < npages; idx++) {
		if (!pages[idx])
			continue;
		TAILQ_INSERT_TAIL(&pool->list, pages[idx], plinks.q);
		pages[idx] = NULL;
		pool->npages++;
	}

	/*
	 * From here on npages holds the number of pages to trim so the pool
	 * does not stay over its configured limit.
	 */
	if (pool->npages > _manager->options.max_size) {
		npages = pool->npages - _manager->options.max_size;
		/* free at least NUM_PAGES_TO_ALLOC number of pages
		 * to reduce calls to set_memory_wb */
		if (npages < NUM_PAGES_TO_ALLOC)
			npages = NUM_PAGES_TO_ALLOC;
	} else {
		npages = 0;
	}
	mtx_unlock(&pool->lock);

	if (npages != 0)
		ttm_page_pool_free(pool, npages);
}

/* * On success pages list will hold count number of correctly * cached pages.
*/ static int ttm_get_pages(vm_page_t *pages, unsigned npages, int flags, enum ttm_caching_state cstate) { struct ttm_page_pool *pool = ttm_get_pool(flags, cstate); struct pglist plist; vm_page_t p = NULL; int gfp_flags; unsigned count; int r; /* No pool for cached pages */ if (pool == NULL) { for (r = 0; r < npages; ++r) { p = ttm_vm_page_alloc(flags, cstate); if (!p) { printf("[TTM] Unable to allocate page\n"); return -ENOMEM; } pages[r] = p; } return 0; } /* combine zero flag to pool flags */ gfp_flags = flags | pool->ttm_page_alloc_flags; /* First we take pages from the pool */ TAILQ_INIT(&plist); npages = ttm_page_pool_get_pages(pool, &plist, flags, cstate, npages); count = 0; TAILQ_FOREACH(p, &plist, plinks.q) { pages[count++] = p; } /* clear the pages coming from the pool if requested */ if (flags & TTM_PAGE_FLAG_ZERO_ALLOC) { TAILQ_FOREACH(p, &plist, plinks.q) { pmap_zero_page(p); } } /* If pool didn't have enough pages allocate new one. */ if (npages > 0) { /* ttm_alloc_new_pages doesn't reference pool so we can run * multiple requests in parallel. **/ TAILQ_INIT(&plist); r = ttm_alloc_new_pages(&plist, gfp_flags, flags, cstate, npages); TAILQ_FOREACH(p, &plist, plinks.q) { pages[count++] = p; } if (r) { /* If there is any pages in the list put them back to * the pool. 
*/ printf("[TTM] Failed to allocate extra pages for large request\n"); ttm_put_pages(pages, count, flags, cstate); return r; } } return 0; } static void ttm_page_pool_init_locked(struct ttm_page_pool *pool, int flags, char *name) { mtx_init(&pool->lock, "ttmpool", NULL, MTX_DEF); pool->fill_lock = false; TAILQ_INIT(&pool->list); pool->npages = pool->nfrees = 0; pool->ttm_page_alloc_flags = flags; pool->name = name; } int ttm_page_alloc_init(struct ttm_mem_global *glob, unsigned max_pages) { if (_manager != NULL) printf("[TTM] manager != NULL\n"); printf("[TTM] Initializing pool allocator\n"); _manager = malloc(sizeof(*_manager), M_TTM_POOLMGR, M_WAITOK | M_ZERO); ttm_page_pool_init_locked(&_manager->wc_pool, 0, "wc"); ttm_page_pool_init_locked(&_manager->uc_pool, 0, "uc"); ttm_page_pool_init_locked(&_manager->wc_pool_dma32, TTM_PAGE_FLAG_DMA32, "wc dma"); ttm_page_pool_init_locked(&_manager->uc_pool_dma32, TTM_PAGE_FLAG_DMA32, "uc dma"); _manager->options.max_size = max_pages; _manager->options.small = SMALL_ALLOCATION; _manager->options.alloc_size = NUM_PAGES_TO_ALLOC; refcount_init(&_manager->kobj_ref, 1); ttm_pool_mm_shrink_init(_manager); return 0; } void ttm_page_alloc_fini(void) { int i; printf("[TTM] Finalizing pool allocator\n"); ttm_pool_mm_shrink_fini(_manager); for (i = 0; i < NUM_POOLS; ++i) ttm_page_pool_free(&_manager->pools[i], FREE_ALL_PAGES); if (refcount_release(&_manager->kobj_ref)) ttm_pool_kobj_release(_manager); _manager = NULL; } int ttm_pool_populate(struct ttm_tt *ttm) { struct ttm_mem_global *mem_glob = ttm->glob->mem_glob; unsigned i; int ret; if (ttm->state != tt_unpopulated) return 0; for (i = 0; i < ttm->num_pages; ++i) { ret = ttm_get_pages(&ttm->pages[i], 1, ttm->page_flags, ttm->caching_state); if (ret != 0) { ttm_pool_unpopulate(ttm); return -ENOMEM; } ret = ttm_mem_global_alloc_page(mem_glob, ttm->pages[i], false, false); if (unlikely(ret != 0)) { ttm_pool_unpopulate(ttm); return -ENOMEM; } } if (unlikely(ttm->page_flags & 
TTM_PAGE_FLAG_SWAPPED)) { ret = ttm_tt_swapin(ttm); if (unlikely(ret != 0)) { ttm_pool_unpopulate(ttm); return ret; } } ttm->state = tt_unbound; return 0; } void ttm_pool_unpopulate(struct ttm_tt *ttm) { unsigned i; for (i = 0; i < ttm->num_pages; ++i) { if (ttm->pages[i]) { ttm_mem_global_free_page(ttm->glob->mem_glob, ttm->pages[i]); ttm_put_pages(&ttm->pages[i], 1, ttm->page_flags, ttm->caching_state); } } ttm->state = tt_unpopulated; } #if 0 /* XXXKIB sysctl */ int ttm_page_alloc_debugfs(struct seq_file *m, void *data) { struct ttm_page_pool *p; unsigned i; char *h[] = {"pool", "refills", "pages freed", "size"}; if (!_manager) { seq_printf(m, "No pool allocator running.\n"); return 0; } seq_printf(m, "%6s %12s %13s %8s\n", h[0], h[1], h[2], h[3]); for (i = 0; i < NUM_POOLS; ++i) { p = &_manager->pools[i]; seq_printf(m, "%6s %12ld %13ld %8d\n", p->name, p->nrefills, p->nfrees, p->npages); } return 0; } #endif Index: head/sys/mips/include/pmap.h =================================================================== --- head/sys/mips/include/pmap.h (revision 292468) +++ head/sys/mips/include/pmap.h (revision 292469) @@ -1,188 +1,187 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Derived from hp300 version by Mike Hibler, this version by William * Jolitz uses a recursive map [a pde points to the page directory] to * map the page tables using the pagetables themselves. This is done to * reduce the impact on kernel virtual memory for lots of sparse address * space, and to reduce the cost of memory to each process. 
*
 * from: hp300: @(#)pmap.h 7.2 (Berkeley) 12/16/90
 * from: @(#)pmap.h 7.4 (Berkeley) 5/12/91
 * from: src/sys/i386/include/pmap.h,v 1.65.2.2 2000/11/30 01:54:42 peter
 * JNPR: pmap.h,v 1.7.2.1 2007/09/10 07:44:12 girish
 * $FreeBSD$
 */

#ifndef _MACHINE_PMAP_H_
#define _MACHINE_PMAP_H_

/*
 * NOTE(review): the header names on the #include lines in this copy of
 * the file are missing -- restore them from the repository.
 */
#include
#include

/* Number of kernel page table pages set up at bootstrap. */
#if defined(__mips_n32) || defined(__mips_n64) /* PHYSADDR_64BIT */
#define NKPT 256 /* mem > 4G, vm_page_startup needs more KPTs */
#else
#define NKPT 120 /* actual number of kernel page tables */
#endif

#ifndef LOCORE

#include
#include
#include
#include

/*
 * Pmap stuff
 */
struct pv_entry;
struct pv_chunk;

/* Machine-dependent part of a vm_page: PV_* flags and its pv entry list. */
struct md_page {
	int pv_flags;
	TAILQ_HEAD(, pv_entry) pv_list;
};

/* Bits kept in md_page.pv_flags. */
#define PV_TABLE_REF 0x02 /* referenced */
#define PV_MEMATTR_UNCACHEABLE 0x04

/*
 * Each per-CPU pm_asid slot is one 32-bit word split into an ASID_BITS
 * wide asid field and an ASIDGEN_BITS wide generation field.
 */
#define ASID_BITS 8
#define ASIDGEN_BITS (32 - ASID_BITS)
#define ASIDGEN_MASK ((1 << ASIDGEN_BITS) - 1)

/* Per address space state. */
struct pmap {
	pd_entry_t *pm_segtab; /* KVA of segment table */
	TAILQ_HEAD(, pv_chunk) pm_pvchunk; /* list of mappings in pmap */
	cpuset_t pm_active; /* active on cpus */
	struct {
		u_int32_t asid:ASID_BITS; /* TLB address space tag */
		u_int32_t gen:ASIDGEN_BITS; /* its generation number */
	} pm_asid[MAXSMPCPU];
	struct pmap_statistics pm_stats; /* pmap statistics */
	struct mtx pm_mtx; /* taken by the PMAP_LOCK*() macros below */
};

typedef struct pmap *pmap_t;

#ifdef _KERNEL

pt_entry_t *pmap_pte(pmap_t, vm_offset_t);
vm_paddr_t pmap_kextract(vm_offset_t va);

/* Physical address backing a kernel virtual address. */
#define vtophys(va) pmap_kextract(((vm_offset_t) (va)))
/*
 * ASID of the pmap on the current CPU.
 * NOTE(review): the expansion is not wrapped in parentheses.
 */
#define pmap_asid(pmap) (pmap)->pm_asid[PCPU_GET(cpuid)].asid

extern struct pmap kernel_pmap_store;
#define kernel_pmap (&kernel_pmap_store)

/* Thin wrappers around the mutex(9) operations on pm_mtx. */
#define PMAP_LOCK(pmap) mtx_lock(&(pmap)->pm_mtx)
#define PMAP_LOCK_ASSERT(pmap, type) mtx_assert(&(pmap)->pm_mtx, (type))
#define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx)
#define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, "pmap", \
    NULL, MTX_DEF)
#define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx)
#define PMAP_MTX(pmap) (&(pmap)->pm_mtx)
#define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx)
#define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx)

/*
 * For each vm_page_t, there is a list of all currently valid virtual
 * mappings of that page. An entry is a pv_entry_t, the list is pv_table.
 */
typedef struct pv_entry {
	vm_offset_t pv_va; /* virtual address for mapping */
	TAILQ_ENTRY(pv_entry) pv_list;
} *pv_entry_t;

/*
 * pv_entries are allocated in chunks per-process. This avoids the
 * need to track per-pmap assignments.
 */
/* _NPCM: words in the pc_map bitmap; _NPCPV: pv entries per chunk. */
#ifdef __mips_n64
#define _NPCM 3
#define _NPCPV 168
#else
#define _NPCM 11
#define _NPCPV 336
#endif
/*
 * One chunk of pv entries belonging to a single pmap.  pmap.c asserts at
 * compile time that sizeof(struct pv_chunk) == PAGE_SIZE.
 */
struct pv_chunk {
	pmap_t pc_pmap;
	TAILQ_ENTRY(pv_chunk) pc_list;
	u_long pc_map[_NPCM]; /* bitmap; 1 = free */
	TAILQ_ENTRY(pv_chunk) pc_lru;
	struct pv_entry pc_pventry[_NPCPV];
};

/*
 * physmem_desc[] is a superset of phys_avail[] and describes all the
 * memory present in the system.
 *
 * phys_avail[] is similar but does not include the memory stolen by
 * pmap_steal_memory().
 *
 * Each memory region is described by a pair of elements in the array
 * so we can describe up to (PHYS_AVAIL_ENTRIES / 2) distinct memory
 * regions.
*/ #define PHYS_AVAIL_ENTRIES 10 extern vm_paddr_t phys_avail[PHYS_AVAIL_ENTRIES + 2]; extern vm_paddr_t physmem_desc[PHYS_AVAIL_ENTRIES + 2]; extern vm_offset_t virtual_avail; extern vm_offset_t virtual_end; extern vm_paddr_t dump_avail[PHYS_AVAIL_ENTRIES + 2]; #define pmap_page_get_memattr(m) VM_MEMATTR_DEFAULT #define pmap_page_is_mapped(m) (!TAILQ_EMPTY(&(m)->md.pv_list)) #define pmap_page_is_write_mapped(m) (((m)->aflags & PGA_WRITEABLE) != 0) void pmap_bootstrap(void); void *pmap_mapdev(vm_paddr_t, vm_size_t); void pmap_unmapdev(vm_offset_t, vm_size_t); vm_offset_t pmap_steal_memory(vm_size_t size); void pmap_kenter(vm_offset_t va, vm_paddr_t pa); void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int attr); void pmap_kremove(vm_offset_t va); void *pmap_kenter_temporary(vm_paddr_t pa, int i); void pmap_kenter_temporary_free(vm_paddr_t pa); void pmap_flush_pvcache(vm_page_t m); int pmap_emulate_modified(pmap_t pmap, vm_offset_t va); -void pmap_grow_direct_page_cache(void); void pmap_page_set_memattr(vm_page_t, vm_memattr_t); #endif /* _KERNEL */ #endif /* !LOCORE */ #endif /* !_MACHINE_PMAP_H_ */ Index: head/sys/mips/mips/pmap.c =================================================================== --- head/sys/mips/mips/pmap.c (revision 292468) +++ head/sys/mips/mips/pmap.c (revision 292469) @@ -1,3558 +1,3571 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * from: src/sys/i386/i386/pmap.c,v 1.250.2.8 2000/11/21 00:09:14 ps * JNPR: pmap.c,v 1.11.2.1 2007/08/16 11:51:06 girish */ /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. 
* * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_pmap.h" #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #else #include #endif #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #undef PMAP_DEBUG #if !defined(DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_seg_index(v) (((v) >> SEGSHIFT) & (NPDEPG - 1)) #define pmap_pde_index(v) (((v) >> PDRSHIFT) & (NPDEPG - 1)) #define pmap_pte_index(v) (((v) >> PAGE_SHIFT) & (NPTEPG - 1)) #define pmap_pde_pindex(v) ((v) >> PDRSHIFT) #ifdef __mips_n64 #define NUPDE (NPDEPG * NPDEPG) #define NUSERPGTBLS (NUPDE + NPDEPG) #else #define NUPDE (NPDEPG) #define NUSERPGTBLS (NUPDE) #endif #define is_kernel_pmap(x) ((x) == kernel_pmap) struct pmap kernel_pmap_store; pd_entry_t *kernel_segmap; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static int nkpt; unsigned pmap_max_asid; /* max ASID supported by the system */ #define PMAP_ASID_RESERVED 0 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; static void pmap_asid_alloc(pmap_t pmap); static struct rwlock_padalign pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = 
TAILQ_HEAD_INITIALIZER(pv_chunks); static int pv_entry_count; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static vm_page_t pmap_alloc_direct_page(unsigned int index, int req); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); +static void pmap_grow_direct_page(int req); static int pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t pde); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_page_t mpte, vm_offset_t va, vm_page_t m); static void pmap_update_page(pmap_t pmap, vm_offset_t va, pt_entry_t pte); static void pmap_invalidate_all(pmap_t pmap); static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, u_int flags); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t); static pt_entry_t init_pte_prot(vm_page_t m, vm_prot_t access, vm_prot_t prot); static void pmap_invalidate_page_action(void *arg); static void pmap_invalidate_range_action(void *arg); static void pmap_update_page_action(void *arg); #ifndef __mips_n64 /* * This structure is for high memory (memory above 512Meg in 32 bit) support. * The highmem area does not have a KSEG0 mapping, and we need a mechanism to * do temporary per-CPU mappings for pmap_zero_page, pmap_copy_page etc. 
*
 * At bootup, we reserve 2 virtual pages per CPU for mapping highmem pages. To
 * access a highmem physical address on a CPU, we map the physical address to
 * the reserved virtual address for the CPU in the kernel pagetable. This is
 * done with interrupts disabled(although a spinlock and sched_pin would be
 * sufficient).
 */
struct local_sysmaps {
	vm_offset_t	base;		/* VA of the first of the two pages */
	uint32_t	saved_intr;	/* intr_disable() result to restore */
	uint16_t	valid1, valid2;	/* set while page 1 / 2 is mapped */
};
static struct local_sysmaps sysmap_lmem[MAXCPU];

/*
 * Reserve two consecutive pages of kernel virtual address space from
 * virtual_avail for every CPU slot and mark both pages as unmapped.
 */
static __inline void
pmap_alloc_lmem_map(void)
{
	int i;

	for (i = 0; i < MAXCPU; i++) {
		sysmap_lmem[i].base = virtual_avail;
		virtual_avail += PAGE_SIZE * 2;
		sysmap_lmem[i].valid1 = sysmap_lmem[i].valid2 = 0;
	}
}

/*
 * Map physical page 'phys' at the current CPU's first reserved page and
 * return that virtual address.  Interrupts are disabled here and the
 * previous state is stored in the per-CPU slot; pmap_lmem_unmap() restores
 * it.
 */
static __inline vm_offset_t
pmap_lmem_map1(vm_paddr_t phys)
{
	struct local_sysmaps *sysm;
	pt_entry_t *pte, npte;
	vm_offset_t va;
	uint32_t intr;
	int cpu;

	intr = intr_disable();
	cpu = PCPU_GET(cpuid);
	sysm = &sysmap_lmem[cpu];
	sysm->saved_intr = intr;
	va = sysm->base;
	/* Cacheable, dirty (writable), valid, global kernel mapping. */
	npte = TLBLO_PA_TO_PFN(phys) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G;
	pte = pmap_pte(kernel_pmap, va);
	*pte = npte;
	sysm->valid1 = 1;
	return (va);
}

/*
 * Like pmap_lmem_map1(), but maps two physical pages: phys1 at the first
 * reserved page and phys2 at the page right after it.  Returns the first
 * virtual address; the second one is PAGE_SIZE above it.
 */
static __inline vm_offset_t
pmap_lmem_map2(vm_paddr_t phys1, vm_paddr_t phys2)
{
	struct local_sysmaps *sysm;
	pt_entry_t *pte, npte;
	vm_offset_t va1, va2;
	uint32_t intr;
	int cpu;

	intr = intr_disable();
	cpu = PCPU_GET(cpuid);
	sysm = &sysmap_lmem[cpu];
	sysm->saved_intr = intr;
	va1 = sysm->base;
	va2 = sysm->base + PAGE_SIZE;
	npte = TLBLO_PA_TO_PFN(phys1) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G;
	pte = pmap_pte(kernel_pmap, va1);
	*pte = npte;
	npte = TLBLO_PA_TO_PFN(phys2) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G;
	pte = pmap_pte(kernel_pmap, va2);
	*pte = npte;
	sysm->valid1 = 1;
	sysm->valid2 = 1;
	return (va1);
}

/*
 * Remove the temporary mapping(s) made by pmap_lmem_map1()/map2() on this
 * CPU: reset the PTE(s) to the invalid-but-global value, invalidate the
 * TLB entries and restore the interrupt state saved by the map call.
 */
static __inline void
pmap_lmem_unmap(void)
{
	struct local_sysmaps *sysm;
	pt_entry_t *pte;
	int cpu;

	cpu = PCPU_GET(cpuid);
	sysm = &sysmap_lmem[cpu];
	pte = pmap_pte(kernel_pmap, sysm->base);
	*pte = PTE_G;
	tlb_invalidate_address(kernel_pmap, sysm->base);
	sysm->valid1 = 0;
	/* The second page is only torn down if map2 set it up. */
	if (sysm->valid2) {
		pte = pmap_pte(kernel_pmap, sysm->base + PAGE_SIZE);
		*pte = PTE_G;
		tlb_invalidate_address(kernel_pmap, sysm->base + PAGE_SIZE);
		sysm->valid2 = 0;
	}
	intr_restore(sysm->saved_intr);
}
#else  /* __mips_n64 */

/*
 * n64 stubs: the map functions return 0 and nothing is reserved.
 * NOTE(review): pmap_lmem_unmap() returns vm_offset_t here but void in
 * the 32-bit variant above -- confirm no caller uses the value and make
 * the two agree.
 */
static __inline void
pmap_alloc_lmem_map(void)
{
}

static __inline vm_offset_t
pmap_lmem_map1(vm_paddr_t phys)
{

	return (0);
}

static __inline vm_offset_t
pmap_lmem_map2(vm_paddr_t phys1, vm_paddr_t phys2)
{

	return (0);
}

static __inline vm_offset_t
pmap_lmem_unmap(void)
{

	return (0);
}
#endif /* !__mips_n64 */

/*
 * True when the page is not flagged PV_MEMATTR_UNCACHEABLE and the
 * physical address itself is cacheable memory.
 */
static __inline int
is_cacheable_page(vm_paddr_t pa, vm_page_t m)
{

	return ((m->md.pv_flags & PV_MEMATTR_UNCACHEABLE) == 0 &&
	    is_cacheable_mem(pa));

}

/*
 * Page table entry lookup routines.
 */

/* Address of the segment table slot covering va (never NULL). */
static __inline pd_entry_t *
pmap_segmap(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_segtab[pmap_seg_index(va)]);
}

#ifdef __mips_n64
/*
 * n64 has an extra directory level: follow the segment table slot to the
 * page directory page and return the slot for va in it.  *pdpe must be
 * non-NULL.
 */
static __inline pd_entry_t *
pmap_pdpe_to_pde(pd_entry_t *pdpe, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = (pd_entry_t *)*pdpe;
	return (&pde[pmap_pde_index(va)]);
}

/* Page directory slot for va, or NULL if the segment slot is empty. */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pdpe;

	pdpe = pmap_segmap(pmap, va);
	if (*pdpe == NULL)
		return (NULL);

	return (pmap_pdpe_to_pde(pdpe, va));
}
#else
/* Without the extra level the segment slot is the page directory slot. */
static __inline pd_entry_t *
pmap_pdpe_to_pde(pd_entry_t *pdpe, vm_offset_t va)
{

	return (pdpe);
}

static __inline
pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va)
{

	return (pmap_segmap(pmap, va));
}
#endif

/* PTE slot for va inside the page table page *pde points to. */
static __inline pt_entry_t *
pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)*pde;
	return (&pte[pmap_pte_index(va)]);
}

/*
 * Return a pointer to the PTE for va in pmap, or NULL when no page table
 * page covers va.
 */
pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (pde == NULL || *pde == NULL)
		return (NULL);

	return (pmap_pde_to_pte(pde, va));
}

/*
 * Take 'size' bytes (rounded up to whole pages) from the front of the
 * first phys_avail[] bank and return their zeroed, direct-mapped address.
 * Leading banks that are too small are dropped entirely by shifting the
 * array down.  Panics when no bank is left or when the memory found is
 * not direct mappable.
 */
vm_offset_t
pmap_steal_memory(vm_size_t size)
{
	vm_paddr_t bank_size, pa;
	vm_offset_t va;

	size = round_page(size);
	bank_size = phys_avail[1] - phys_avail[0];
	while (size > bank_size) {
		int i;

		/* Discard bank 0: move every later pair down one slot. */
		for (i = 0; phys_avail[i + 2]; i += 2) {
			phys_avail[i] = phys_avail[i + 2];
			phys_avail[i + 1] = phys_avail[i + 3];
		}
		phys_avail[i] = 0;
		phys_avail[i + 1] = 0;
		if (!phys_avail[0])
			panic("pmap_steal_memory: out of memory");
		bank_size = phys_avail[1] - phys_avail[0];
	}

	pa = phys_avail[0];
	phys_avail[0] += size;
	if (MIPS_DIRECT_MAPPABLE(pa) == 0)
		panic("Out of memory below 512Meg?");
	va = MIPS_PHYS_TO_DIRECT(pa);
	bzero((caddr_t)va, size);
	return (va);
}

/*
 * Bootstrap the system enough to run with virtual memory. This
 * assumes that the phys_avail array has been initialized.
 */
/*
 * Build the initial kernel page tables from stolen memory: one segment
 * table page, NKPT page table pages (plus the page directory pages on
 * n64), every PTE preset to PTE_G.  Also initialises kernel_pmap and
 * advances kernel_vm_end past the range the new tables cover.
 */
static void
pmap_create_kernel_pagetable(void)
{
	int i, j;
	vm_offset_t ptaddr;
	pt_entry_t *pte;
#ifdef __mips_n64
	pd_entry_t *pde;
	vm_offset_t pdaddr;
	int npt, npde;
#endif

	/*
	 * Allocate segment table for the kernel
	 */
	kernel_segmap = (pd_entry_t *)pmap_steal_memory(PAGE_SIZE);

	/*
	 * Allocate second level page tables for the kernel
	 */
#ifdef __mips_n64
	npde = howmany(NKPT, NPDEPG);
	pdaddr = pmap_steal_memory(PAGE_SIZE * npde);
#endif
	nkpt = NKPT;
	ptaddr = pmap_steal_memory(PAGE_SIZE * nkpt);

	/*
	 * The R[4-7]?00 stores only one copy of the Global bit in the
	 * translation lookaside buffer for each 2 page entry. Thus invalid
	 * entries must have the Global bit set so when Entry LO and Entry HI
	 * G bits are anded together they will produce a global bit to store
	 * in the tlb.
	 */
	for (i = 0, pte = (pt_entry_t *)ptaddr; i < (nkpt * NPTEPG); i++, pte++)
		*pte = PTE_G;

#ifdef __mips_n64
	/* Hook page directory pages into the segment table, then fill them. */
	for (i = 0, npt = nkpt; npt > 0; i++) {
		kernel_segmap[i] = (pd_entry_t)(pdaddr + i * PAGE_SIZE);
		pde = (pd_entry_t *)kernel_segmap[i];

		for (j = 0; j < NPDEPG && npt > 0; j++, npt--)
			pde[j] = (pd_entry_t)(ptaddr + (i * NPDEPG + j) * PAGE_SIZE);
	}
#else
	/* Page table pages go straight into the segment table. */
	for (i = 0, j = pmap_seg_index(VM_MIN_KERNEL_ADDRESS); i < nkpt; i++, j++)
		kernel_segmap[j] = (pd_entry_t)(ptaddr + (i * PAGE_SIZE));
#endif

	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_segtab = kernel_segmap;
	CPU_FILL(&kernel_pmap->pm_active);
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
	kernel_pmap->pm_asid[0].asid = PMAP_ASID_RESERVED;
	kernel_pmap->pm_asid[0].gen = 0;
	kernel_vm_end += nkpt * NPTEPG * PAGE_SIZE;
}

/*
 * Early pmap setup.  Page-aligns and sorts phys_avail[], snapshots it into
 * physmem_desc[], sets Maxmem, steals memory for the message buffer and
 * the thread0 kernel stack, sets up the kernel virtual address range (and
 * the pcpu mapping on SMP), reserves the per-CPU highmem mapping pages if
 * some memory is not direct mappable, and builds the kernel page tables.
 */
void
pmap_bootstrap(void)
{
	int i;
	int need_local_mappings = 0;

	/* Sort. */
	/* Swap an out-of-order pair with its predecessor, then rescan. */
again:
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		/*
		 * Keep the memory aligned on page boundary.
		 */
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);

		if (i < 2)
			continue;
		if (phys_avail[i - 2] > phys_avail[i]) {
			vm_paddr_t ptemp[2];

			ptemp[0] = phys_avail[i + 0];
			ptemp[1] = phys_avail[i + 1];

			phys_avail[i + 0] = phys_avail[i - 2];
			phys_avail[i + 1] = phys_avail[i - 1];

			phys_avail[i - 2] = ptemp[0];
			phys_avail[i - 1] = ptemp[1];
			goto again;
		}
	}

	/*
	 * In 32 bit, we may have memory which cannot be mapped directly.
	 * This memory will need temporary mapping before it can be
	 * accessed.
	 */
	/* After the sort, phys_avail[i - 1] is the end of the last bank. */
	if (!MIPS_DIRECT_MAPPABLE(phys_avail[i - 1] - 1))
		need_local_mappings = 1;

	/*
	 * Copy the phys_avail[] array before we start stealing memory from it.
	 */
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		physmem_desc[i] = phys_avail[i];
		physmem_desc[i + 1] = phys_avail[i + 1];
	}

	Maxmem = atop(phys_avail[i - 1]);

	if (bootverbose) {
		printf("Physical memory chunk(s):\n");
		for (i = 0; phys_avail[i + 1] != 0; i += 2) {
			vm_paddr_t size;

			size = phys_avail[i + 1] - phys_avail[i];
			printf("%#08jx - %#08jx, %ju bytes (%ju pages)\n",
			    (uintmax_t) phys_avail[i],
			    (uintmax_t) phys_avail[i + 1] - 1,
			    (uintmax_t) size, (uintmax_t) size / PAGE_SIZE);
		}
		printf("Maxmem is 0x%0jx\n", ptoa((uintmax_t)Maxmem));
	}
	/*
	 * Steal the message buffer from the beginning of memory.
	 */
	msgbufp = (struct msgbuf *)pmap_steal_memory(msgbufsize);
	msgbufinit(msgbufp, msgbufsize);

	/*
	 * Steal thread0 kstack.
	 */
	kstack0 = pmap_steal_memory(KSTACK_PAGES << PAGE_SHIFT);

	virtual_avail = VM_MIN_KERNEL_ADDRESS;
	virtual_end = VM_MAX_KERNEL_ADDRESS;

#ifdef SMP
	/*
	 * Steal some virtual address space to map the pcpu area.
	 */
	virtual_avail = roundup2(virtual_avail, PAGE_SIZE * 2);
	pcpup = (struct pcpu *)virtual_avail;
	virtual_avail += PAGE_SIZE * 2;

	/*
	 * Initialize the wired TLB entry mapping the pcpu region for
	 * the BSP at 'pcpup'. Up until this point we were operating
	 * with the 'pcpup' for the BSP pointing to a virtual address
	 * in KSEG0 so there was no need for a TLB mapping.
	 */
	mips_pcpu_tlb_init(PCPU_ADDR(0));

	if (bootverbose)
		printf("pcpu is available at virtual address %p.\n", pcpup);
#endif

	/* Must run before the page tables are built: it moves virtual_avail. */
	if (need_local_mappings)
		pmap_alloc_lmem_map();
	pmap_create_kernel_pagetable();
	pmap_max_asid = VMNUM_PIDS;
	mips_wr_entryhi(0);
	mips_wr_pagemask(0);

	/*
	 * Initialize the global pv list lock.
	 */
	rw_init(&pvh_global_lock, "pmap pv global");
}

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pv_flags = 0;
}

/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
*/ void pmap_init(void) { } /*************************************************** * Low level helper routines..... ***************************************************/ #ifdef SMP static __inline void pmap_call_on_active_cpus(pmap_t pmap, void (*fn)(void *), void *arg) { int cpuid, cpu, self; cpuset_t active_cpus; sched_pin(); if (is_kernel_pmap(pmap)) { smp_rendezvous(NULL, fn, NULL, arg); goto out; } /* Force ASID update on inactive CPUs */ CPU_FOREACH(cpu) { if (!CPU_ISSET(cpu, &pmap->pm_active)) pmap->pm_asid[cpu].gen = 0; } cpuid = PCPU_GET(cpuid); /* * XXX: barrier/locking for active? * * Take a snapshot of active here, any further changes are ignored. * tlb update/invalidate should be harmless on inactive CPUs */ active_cpus = pmap->pm_active; self = CPU_ISSET(cpuid, &active_cpus); CPU_CLR(cpuid, &active_cpus); /* Optimize for the case where this cpu is the only active one */ if (CPU_EMPTY(&active_cpus)) { if (self) fn(arg); } else { if (self) CPU_SET(cpuid, &active_cpus); smp_rendezvous_cpus(active_cpus, NULL, fn, NULL, arg); } out: sched_unpin(); } #else /* !SMP */ static __inline void pmap_call_on_active_cpus(pmap_t pmap, void (*fn)(void *), void *arg) { int cpuid; if (is_kernel_pmap(pmap)) { fn(arg); return; } cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &pmap->pm_active)) pmap->pm_asid[cpuid].gen = 0; else fn(arg); } #endif /* SMP */ static void pmap_invalidate_all(pmap_t pmap) { pmap_call_on_active_cpus(pmap, (void (*)(void *))tlb_invalidate_all_user, pmap); } struct pmap_invalidate_page_arg { pmap_t pmap; vm_offset_t va; }; static void pmap_invalidate_page_action(void *arg) { struct pmap_invalidate_page_arg *p = arg; tlb_invalidate_address(p->pmap, p->va); } static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { struct pmap_invalidate_page_arg arg; arg.pmap = pmap; arg.va = va; pmap_call_on_active_cpus(pmap, pmap_invalidate_page_action, &arg); } struct pmap_invalidate_range_arg { pmap_t pmap; vm_offset_t sva; vm_offset_t eva; }; static void 
pmap_invalidate_range_action(void *arg) { struct pmap_invalidate_range_arg *p = arg; tlb_invalidate_range(p->pmap, p->sva, p->eva); } static void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct pmap_invalidate_range_arg arg; arg.pmap = pmap; arg.sva = sva; arg.eva = eva; pmap_call_on_active_cpus(pmap, pmap_invalidate_range_action, &arg); } struct pmap_update_page_arg { pmap_t pmap; vm_offset_t va; pt_entry_t pte; }; static void pmap_update_page_action(void *arg) { struct pmap_update_page_arg *p = arg; tlb_update(p->pmap, p->va, p->pte); } static void pmap_update_page(pmap_t pmap, vm_offset_t va, pt_entry_t pte) { struct pmap_update_page_arg arg; arg.pmap = pmap; arg.va = va; arg.pte = pte; pmap_call_on_active_cpus(pmap, pmap_update_page_action, &arg); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte; vm_offset_t retval = 0; PMAP_LOCK(pmap); pte = pmap_pte(pmap, va); if (pte) { retval = TLBLO_PTE_TO_PA(*pte) | (va & PAGE_MASK); } PMAP_UNLOCK(pmap); return (retval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t pte, *ptep; vm_paddr_t pa, pte_pa; vm_page_t m; m = NULL; pa = 0; PMAP_LOCK(pmap); retry: ptep = pmap_pte(pmap, va); if (ptep != NULL) { pte = *ptep; if (pte_test(&pte, PTE_V) && (!pte_test(&pte, PTE_RO) || (prot & VM_PROT_WRITE) == 0)) { pte_pa = TLBLO_PTE_TO_PA(pte); if (vm_page_pa_tryrelock(pmap, pte_pa, &pa)) goto retry; m = PHYS_TO_VM_PAGE(pte_pa); vm_page_hold(m); } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... 
***************************************************/ /* * add a wired page to the kva */ void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int attr) { pt_entry_t *pte; pt_entry_t opte, npte; #ifdef PMAP_DEBUG printf("pmap_kenter: va: %p -> pa: %p\n", (void *)va, (void *)pa); #endif pte = pmap_pte(kernel_pmap, va); opte = *pte; npte = TLBLO_PA_TO_PFN(pa) | attr | PTE_D | PTE_V | PTE_G; *pte = npte; if (pte_test(&opte, PTE_V) && opte != npte) pmap_update_page(kernel_pmap, va, npte); } void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { KASSERT(is_cacheable_mem(pa), ("pmap_kenter: memory at 0x%lx is not cacheable", (u_long)pa)); pmap_kenter_attr(va, pa, PTE_C_CACHE); } /* * remove a page from the kernel pagetables */ /* PMAP_INLINE */ void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; /* * Write back all caches from the page being destroyed */ mips_dcache_wbinv_range_index(va, PAGE_SIZE); pte = pmap_pte(kernel_pmap, va); *pte = PTE_G; pmap_invalidate_page(kernel_pmap, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. 
*/ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; if (MIPS_DIRECT_MAPPABLE(end - 1)) return (MIPS_PHYS_TO_DIRECT(start)); va = sva = *virt; while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(vm_offset_t va, vm_page_t *m, int count) { int i; vm_offset_t origva = va; for (i = 0; i < count; i++) { pmap_flush_pvcache(m[i]); pmap_kenter(va, VM_PAGE_TO_PHYS(m[i])); va += PAGE_SIZE; } mips_dcache_wbinv_range_index(origva, PAGE_SIZE*count); } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t va, int count) { pt_entry_t *pte; vm_offset_t origva; if (count < 1) return; mips_dcache_wbinv_range_index(va, PAGE_SIZE * count); origva = va; do { pte = pmap_pte(kernel_pmap, va); *pte = PTE_G; va += PAGE_SIZE; } while (--count > 0); pmap_invalidate_range(kernel_pmap, origva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. 
*/ static PMAP_INLINE boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, va, m); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m) { pd_entry_t *pde; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ #ifdef __mips_n64 if (m->pindex < NUPDE) pde = pmap_pde(pmap, va); else pde = pmap_segmap(pmap, va); #else pde = pmap_pde(pmap, va); #endif *pde = 0; pmap->pm_stats.resident_count--; #ifdef __mips_n64 if (m->pindex < NUPDE) { pd_entry_t *pdp; vm_page_t pdpg; /* * Recursively decrement next level pagetable refcount */ pdp = (pd_entry_t *)*pmap_segmap(pmap, va); pdpg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(pdp)); pmap_unwire_ptp(pmap, va, pdpg); } #endif /* * If the page is finally unwired, simply free it. */ vm_page_free_zero(m); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. 
*/ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(pde != 0, ("pmap_unuse_pt: pde != 0")); mpte = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(pde)); return (pmap_unwire_ptp(pmap, va, mpte)); } void pmap_pinit0(pmap_t pmap) { int i; PMAP_LOCK_INIT(pmap); pmap->pm_segtab = kernel_segmap; CPU_ZERO(&pmap->pm_active); for (i = 0; i < MAXCPU; i++) { pmap->pm_asid[i].asid = PMAP_ASID_RESERVED; pmap->pm_asid[i].gen = 0; } PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } -void -pmap_grow_direct_page_cache() +static void +pmap_grow_direct_page(int req) { #ifdef __mips_n64 VM_WAIT; #else - vm_pageout_grow_cache(3, 0, MIPS_KSEG0_LARGEST_PHYS); + if (!vm_page_reclaim_contig(req, 1, 0, MIPS_KSEG0_LARGEST_PHYS, + PAGE_SIZE, 0)) + VM_WAIT; #endif } static vm_page_t pmap_alloc_direct_page(unsigned int index, int req) { vm_page_t m; m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, req | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) return (NULL); if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); m->pindex = index; return (m); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. 
*/ int pmap_pinit(pmap_t pmap) { vm_offset_t ptdva; vm_page_t ptdpg; - int i; + int i, req_class; /* * allocate the page directory page */ - while ((ptdpg = pmap_alloc_direct_page(NUSERPGTBLS, VM_ALLOC_NORMAL)) == NULL) - pmap_grow_direct_page_cache(); + req_class = VM_ALLOC_NORMAL; + while ((ptdpg = pmap_alloc_direct_page(NUSERPGTBLS, req_class)) == + NULL) + pmap_grow_direct_page(req_class); ptdva = MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(ptdpg)); pmap->pm_segtab = (pd_entry_t *)ptdva; CPU_ZERO(&pmap->pm_active); for (i = 0; i < MAXCPU; i++) { pmap->pm_asid[i].asid = PMAP_ASID_RESERVED; pmap->pm_asid[i].gen = 0; } TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, u_int flags) { vm_offset_t pageva; vm_page_t m; + int req_class; /* * Find or fabricate a new pagetable page */ - if ((m = pmap_alloc_direct_page(ptepindex, VM_ALLOC_NORMAL)) == NULL) { + req_class = VM_ALLOC_NORMAL; + if ((m = pmap_alloc_direct_page(ptepindex, req_class)) == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); - pmap_grow_direct_page_cache(); + pmap_grow_direct_page(req_class); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page * table page may have been allocated. */ return (NULL); } /* * Map the pagetable page into the process address space, if it * isn't already there. 
*/ pageva = MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); #ifdef __mips_n64 if (ptepindex >= NUPDE) { pmap->pm_segtab[ptepindex - NUPDE] = (pd_entry_t)pageva; } else { pd_entry_t *pdep, *pde; int segindex = ptepindex >> (SEGSHIFT - PDRSHIFT); int pdeindex = ptepindex & (NPDEPG - 1); vm_page_t pg; pdep = &pmap->pm_segtab[segindex]; if (*pdep == NULL) { /* recurse for allocating page dir */ if (_pmap_allocpte(pmap, NUPDE + segindex, flags) == NULL) { /* alloc failed, release current */ --m->wire_count; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); return (NULL); } } else { pg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(*pdep)); pg->wire_count++; } /* Next level entry */ pde = (pd_entry_t *)*pdep; pde[pdeindex] = (pd_entry_t)pageva; } #else pmap->pm_segtab[ptepindex] = (pd_entry_t)pageva; #endif pmap->pm_stats.resident_count++; return (m); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) { unsigned ptepindex; pd_entry_t *pde; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); retry: /* * Get the page directory entry */ pde = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment the hold * count, and activate it. */ if (pde != NULL && *pde != NULL) { m = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(*pde)); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. 
*/ void pmap_release(pmap_t pmap) { vm_offset_t ptdva; vm_page_t ptdpg; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); ptdva = (vm_offset_t)pmap->pm_segtab; ptdpg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(ptdva)); ptdpg->wire_count--; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(ptdpg); } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_page_t nkpg; pd_entry_t *pde, *pdpe; pt_entry_t *pte; - int i; + int i, req_class; mtx_assert(&kernel_map->system_mtx, MA_OWNED); + req_class = VM_ALLOC_INTERRUPT; addr = roundup2(addr, NBSEG); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { pdpe = pmap_segmap(kernel_pmap, kernel_vm_end); #ifdef __mips_n64 if (*pdpe == 0) { /* new intermediate page table entry */ - nkpg = pmap_alloc_direct_page(nkpt, VM_ALLOC_INTERRUPT); + nkpg = pmap_alloc_direct_page(nkpt, req_class); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); *pdpe = (pd_entry_t)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(nkpg)); continue; /* try again */ } #endif pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); if (*pde != 0) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } /* * This index is bogus, but out of the way */ - nkpg = pmap_alloc_direct_page(nkpt, VM_ALLOC_INTERRUPT); - if (!nkpg) + nkpg = pmap_alloc_direct_page(nkpt, req_class); +#ifndef __mips_n64 + if (nkpg == NULL && vm_page_reclaim_contig(req_class, 1, + 0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0)) + nkpg = pmap_alloc_direct_page(nkpt, req_class); +#endif + if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; *pde = (pd_entry_t)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(nkpg)); /* * The R[4-7]?00 stores only one copy of the Global bit in * the translation lookaside 
buffer for each 2 page entry. * Thus invalid entrys must have the Global bit set so when * Entry LO and Entry HI G bits are anded together they will * produce a global bit to store in the tlb. */ pte = (pt_entry_t *)*pde; for (i = 0; i < NPTEPG; i++) pte[i] = PTE_G; kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); #ifdef __mips_n64 CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); #else CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); #endif static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #ifdef __mips_n64 #define PC_FREE0_1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful #else #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ #endif static const u_long pc_freemask[_NPCM] = { #ifdef __mips_n64 PC_FREE0_1, PC_FREE0_1, PC_FREE2 #else PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 #endif }; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, 
&pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, oldpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; u_long inuse; int bit, field, freed, idx; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. 
*/ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = ffsl(inuse) - 1; idx = field * sizeof(inuse) * NBBY + bit; pv = &pc->pc_pventry[idx]; va = pv->pv_va; pde = pmap_pde(pmap, va); KASSERT(pde != NULL && *pde != 0, ("pmap_pv_reclaim: pde")); pte = pmap_pde_to_pte(pde, va); oldpte = *pte; if (pte_test(&oldpte, PTE_W)) continue; if (is_kernel_pmap(pmap)) *pte = PTE_G; else *pte = 0; m = PHYS_TO_VM_PAGE(TLBLO_PTE_TO_PA(oldpte)); if (pte_test(&oldpte, PTE_D)) vm_page_dirty(m); if (m->md.pv_flags & PV_TABLE_REF) vm_page_aflag_set(m, PGA_REFERENCED); m->md.pv_flags &= ~PV_TABLE_REF; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, *pde); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. 
*/ m_pc = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS( (vm_offset_t)pc)); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int bit, field, idx; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / (sizeof(u_long) * NBBY); bit = idx % (sizeof(u_long) * NBBY); pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS((vm_offset_t)pc)); vm_page_unwire(m, PQ_NONE); vm_page_free(m); } /* * get a new pv_entry, allocating a block from the system * when needed. 
 */
/*
 * Hand out one pv entry for "pmap".  The first chunk on the pmap's chunk
 * list is searched for a free slot; if there is none a new chunk page is
 * allocated from the direct-mapped freelist.  When that allocation fails
 * and "try" is set, NULL is returned; otherwise entries are reclaimed via
 * pmap_pv_reclaim() and the search is retried.
 *
 * The caller must hold pvh_global_lock for writing and the pmap lock.
 */
static pv_entry_t
get_pv_entry(pmap_t pmap, boolean_t try)
{
	struct pv_chunk *pc;
	pv_entry_t pv;
	vm_page_t m;
	int bit, field, idx;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(pv_entry_allocs++);
	/* Counted up front; undone below if a "try" allocation fails. */
	pv_entry_count++;
retry:
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		/* Find the first bitmap word that still has a free bit. */
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = ffsl(pc->pc_map[field]) - 1;
				break;
			}
		}
		if (field < _NPCM) {
			idx = field * sizeof(pc->pc_map[field]) * NBBY + bit;
			pv = &pc->pc_pventry[idx];
			/* A cleared bit marks the slot as in use. */
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to tail */
			for (field = 0; field < _NPCM; field++)
				if (pc->pc_map[field] != 0) {
					PV_STAT(pv_entry_spare--);
					return (pv);	/* not full, return */
				}
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
			PV_STAT(pv_entry_spare--);
			return (pv);
		}
	}
	/* No free items, allocate another chunk */
	m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, VM_ALLOC_NORMAL |
	    VM_ALLOC_WIRED);
	if (m == NULL) {
		if (try) {
			pv_entry_count--;
			PV_STAT(pc_chunk_tryfail++);
			return (NULL);
		}
		/*
		 * A NULL result here means no whole page was handed back;
		 * rescan the pmap's chunk list.
		 */
		m = pmap_pv_reclaim(pmap);
		if (m == NULL)
			goto retry;
	}
	PV_STAT(pc_chunk_count++);
	PV_STAT(pc_chunk_allocs++);
	/* The chunk is accessed through the direct map of its page. */
	pc = (struct pv_chunk *)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m));
	pc->pc_pmap = pmap;
	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
	for (field = 1; field < _NPCM; field++)
		pc->pc_map[field] = pc_freemask[field];
	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(pv_entry_spare += _NPCPV - 1);
	return (pv);
}

/*
 * Unlink from "pvh" the pv entry that belongs to "pmap" and maps "va",
 * and return it.  The entry itself is not freed.  Callers treat a NULL
 * result as "no such entry" (see the assertion in pmap_pvh_free()).
 */
static pv_entry_t
pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
			break;
		}
	}
	return (pv);
}

/*
 * Remove the pv entry for (pmap, va) from "pvh" and release it with
 * free_pv_entry().  The entry is asserted to exist.
 */
static void
pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found, pa %lx va %lx",
	    (u_long)VM_PAGE_TO_PHYS(__containerof(pvh, struct vm_page, md)),
	    (u_long)va));
	free_pv_entry(pmap, pv);
}

/*
 * Drop the pv entry for (pmap, va) from page "m"; once the page has no
 * pv entries left, clear its PGA_WRITEABLE flag.
 */
static void
pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
{

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	pmap_pvh_free(&m->md, pmap, va);
	if (TAILQ_EMPTY(&m->md.pv_list))
		vm_page_aflag_clear(m, PGA_WRITEABLE);
}

/*
 * Conditionally create a pv entry.
 *
 * Uses the non-reclaiming ("try") form of get_pv_entry() and returns
 * FALSE when no entry could be obtained.  "mpte" is not used here.
 */
static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap, vm_page_t mpte, vm_offset_t va,
    vm_page_t m)
{
	pv_entry_t pv;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if ((pv = get_pv_entry(pmap, TRUE)) != NULL) {
		pv->pv_va = va;
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
		return (TRUE);
	} else
		return (FALSE);
}

/*
 * pmap_remove_pte: do the things to unmap a page in a process
 *
 * Clears the PTE at "ptq", adjusts the pmap statistics and, for managed
 * pages, transfers dirty/referenced state to the vm_page and drops the
 * pv entry.  Returns whatever pmap_unuse_pt() returns; pmap_remove()
 * stops scanning the current page table page on a nonzero result
 * (presumably because that page was released -- confirm against
 * pmap_unuse_pt()).  TLB invalidation is left to the caller.
 */
static int
pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va,
    pd_entry_t pde)
{
	pt_entry_t oldpte;
	vm_page_t m;
	vm_paddr_t pa;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Write back all cache lines from the page being unmapped.
	 */
	mips_dcache_wbinv_range_index(va, PAGE_SIZE);

	oldpte = *ptq;
	/* Kernel PTEs keep PTE_G even when invalid. */
	if (is_kernel_pmap(pmap))
		*ptq = PTE_G;
	else
		*ptq = 0;

	if (pte_test(&oldpte, PTE_W))
		pmap->pm_stats.wired_count -= 1;

	pmap->pm_stats.resident_count -= 1;

	if (pte_test(&oldpte, PTE_MANAGED)) {
		pa = TLBLO_PTE_TO_PA(oldpte);
		m = PHYS_TO_VM_PAGE(pa);
		if (pte_test(&oldpte, PTE_D)) {
			KASSERT(!pte_test(&oldpte, PTE_RO),
			    ("%s: modified page not writable: va: %p, pte: %#jx",
			    __func__, (void *)va, (uintmax_t)oldpte));
			vm_page_dirty(m);
		}
		if (m->md.pv_flags & PV_TABLE_REF)
			vm_page_aflag_set(m, PGA_REFERENCED);
		m->md.pv_flags &= ~PV_TABLE_REF;

		pmap_remove_entry(pmap, m, va);
	}
	return (pmap_unuse_pt(pmap, va, pde));
}

/*
 * Remove a single page from a process address space
 *
 * Does nothing when the page directory entry is missing or the PTE is
 * not valid; otherwise removes the mapping and invalidates its TLB
 * entry.
 */
static void
pmap_remove_page(struct pmap *pmap, vm_offset_t va)
{
	pd_entry_t *pde;
	pt_entry_t *ptq;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pde = pmap_pde(pmap, va);
	if (pde == NULL || *pde == 0)
		return;
	ptq = pmap_pde_to_pte(pde, va);

	/*
	 * If there is no pte for this address, just skip it!
	 */
	if (!pte_test(ptq, PTE_V))
		return;

	(void)pmap_remove_pte(pmap, ptq, va, *pde);
	pmap_invalidate_page(pmap, va);
}

/*
 *	Remove the given range of addresses from the specified map.
 *
 *	It is assumed that the start and end are properly
 *	rounded to the page size.
 */
void
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	pd_entry_t *pde, *pdpe;
	pt_entry_t *pte;
	vm_offset_t va, va_next;

	/*
	 * Perform an unsynchronized read.  This is, however, safe.
	 */
	if (pmap->pm_stats.resident_count == 0)
		return;

	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);

	/*
	 * special handling of removing one page.  a very common operation
	 * and easy to short circuit some code.
	 */
	if ((sva + PAGE_SIZE) == eva) {
		pmap_remove_page(pmap, sva);
		goto out;
	}
	for (; sva < eva; sva = va_next) {
		pdpe = pmap_segmap(pmap, sva);
#ifdef __mips_n64
		if (*pdpe == 0) {
			/* Skip the whole segment; on wrap-around stop at eva. */
			va_next = (sva + NBSEG) & ~SEGMASK;
			if (va_next < sva)
				va_next = eva;
			continue;
		}
#endif
		va_next = (sva + NBPDR) & ~PDRMASK;
		if (va_next < sva)
			va_next = eva;

		pde = pmap_pdpe_to_pde(pdpe, sva);
		if (*pde == NULL)
			continue;

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current page table page, or to the end of the
		 * range being removed.
		 */
		if (va_next > eva)
			va_next = eva;

		/*
		 * "va" is the start of the run of removed pages that still
		 * needs a TLB invalidation; va == va_next means no run is
		 * pending.
		 */
		va = va_next;
		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
		    sva += PAGE_SIZE) {
			if (!pte_test(pte, PTE_V)) {
				/* An invalid PTE ends the pending run. */
				if (va != va_next) {
					pmap_invalidate_range(pmap, va, sva);
					va = va_next;
				}
				continue;
			}
			if (va == va_next)
				va = sva;
			if (pmap_remove_pte(pmap, pte, sva, *pde)) {
				/* Include this page in the final flush. */
				sva += PAGE_SIZE;
				break;
			}
		}
		if (va != va_next)
			pmap_invalidate_range(pmap, va, sva);
	}
out:
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
}

/*
 *	Routine:	pmap_remove_all
 *	Function:
 *		Removes this physical page from
 *		all physical maps in which it resides.
 *		Reflects back modify bits to the pager.
 *
 *	Notes:
 *		Original versions of this routine were very
 *		inefficient because they iteratively called
 *		pmap_remove (slow...)
 */
/*
 * "m" must be a managed page (asserted).  pvh_global_lock is taken for
 * writing for the whole operation and each owning pmap is locked while
 * its mapping is torn down.
 */
void
pmap_remove_all(vm_page_t m)
{
	pv_entry_t pv;
	pmap_t pmap;
	pd_entry_t *pde;
	pt_entry_t *pte, tpte;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_all: page %p is not managed", m));
	rw_wlock(&pvh_global_lock);

	/* Carry the software reference bit over to the vm_page flags. */
	if (m->md.pv_flags & PV_TABLE_REF)
		vm_page_aflag_set(m, PGA_REFERENCED);

	/* Each iteration removes the head pv entry, so the list drains. */
	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);

		/*
		 * If it's last mapping writeback all caches from
		 * the page being destroyed
		 */
		if (TAILQ_NEXT(pv, pv_list) == NULL)
			mips_dcache_wbinv_range_index(pv->pv_va, PAGE_SIZE);

		pmap->pm_stats.resident_count--;

		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT(pde != NULL && *pde != 0, ("pmap_remove_all: pde"));
		pte = pmap_pde_to_pte(pde, pv->pv_va);

		tpte = *pte;
		/* Kernel PTEs keep PTE_G even when invalid. */
		if (is_kernel_pmap(pmap))
			*pte = PTE_G;
		else
			*pte = 0;

		if (pte_test(&tpte, PTE_W))
			pmap->pm_stats.wired_count--;

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if (pte_test(&tpte, PTE_D)) {
			KASSERT(!pte_test(&tpte, PTE_RO),
			    ("%s: modified page not writable: va: %p, pte: %#jx",
			    __func__, (void *)pv->pv_va, (uintmax_t)tpte));
			vm_page_dirty(m);
		}
		pmap_invalidate_page(pmap, pv->pv_va);

		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
		pmap_unuse_pt(pmap, pv->pv_va, *pde);
		free_pv_entry(pmap, pv);
		PMAP_UNLOCK(pmap);
	}

	/* No mappings remain, so the page can no longer be written. */
	vm_page_aflag_clear(m, PGA_WRITEABLE);
	m->md.pv_flags &= ~PV_TABLE_REF;
	rw_wunlock(&pvh_global_lock);
}

/*
 *	Set the physical protection on the
 *	specified range of this map as requested.
 */
/*
 * A request without read permission removes the range outright; a
 * request that still includes write permission is a no-op.  Otherwise
 * every valid, not yet read-only PTE in [sva, eva) is made read-only.
 */
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
	pt_entry_t pbits, *pte;
	pd_entry_t *pde, *pdpe;
	vm_offset_t va, va_next;
	vm_paddr_t pa;
	vm_page_t m;

	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
		pmap_remove(pmap, sva, eva);
		return;
	}
	if (prot & VM_PROT_WRITE)
		return;

	PMAP_LOCK(pmap);
	for (; sva < eva; sva = va_next) {
		pdpe = pmap_segmap(pmap, sva);
#ifdef __mips_n64
		if (*pdpe == 0) {
			/* Skip the whole segment; on wrap-around stop at eva. */
			va_next = (sva + NBSEG) & ~SEGMASK;
			if (va_next < sva)
				va_next = eva;
			continue;
		}
#endif
		va_next = (sva + NBPDR) & ~PDRMASK;
		if (va_next < sva)
			va_next = eva;

		pde = pmap_pdpe_to_pde(pdpe, sva);
		if (*pde == NULL)
			continue;

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current page table page, or to the end of the
		 * range being write protected.
		 */
		if (va_next > eva)
			va_next = eva;

		/*
		 * "va" is the start of the run of pages awaiting a TLB
		 * invalidation; va == va_next means no run is pending.
		 */
		va = va_next;
		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
		    sva += PAGE_SIZE) {
			pbits = *pte;
			if (!pte_test(&pbits, PTE_V) || pte_test(&pbits,
			    PTE_RO)) {
				/* Nothing to change; flush any pending run. */
				if (va != va_next) {
					pmap_invalidate_range(pmap, va, sva);
					va = va_next;
				}
				continue;
			}
			pte_set(&pbits, PTE_RO);
			if (pte_test(&pbits, PTE_D)) {
				/* Record the modification before dropping PTE_D. */
				pte_clear(&pbits, PTE_D);
				if (pte_test(&pbits, PTE_MANAGED)) {
					pa = TLBLO_PTE_TO_PA(pbits);
					m = PHYS_TO_VM_PAGE(pa);
					vm_page_dirty(m);
				}
				if (va == va_next)
					va = sva;
			} else {
				/*
				 * Unless PTE_D is set, any TLB entries
				 * mapping "sva" don't allow write access, so
				 * they needn't be invalidated.
				 */
				if (va != va_next) {
					pmap_invalidate_range(pmap, va, sva);
					va = va_next;
				}
			}
			*pte = pbits;
		}
		if (va != va_next)
			pmap_invalidate_range(pmap, va, sva);
	}
	PMAP_UNLOCK(pmap);
}

/*
 *	Insert the given physical page (p) at
 *	the specified virtual address (v) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte can not be reclaimed.
 *
 *	NB:  This is the only routine which MAY NOT lazy-evaluate
 *	or lose information.
 *	That is, this routine must actually
 *	insert this page into the given map NOW.
 *
 *	Returns KERN_SUCCESS, or KERN_RESOURCE_SHORTAGE when a page table
 *	page could not be allocated for a user address and the caller
 *	passed PMAP_ENTER_NOSLEEP.  "psind" is unused.
 */
int
pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
    u_int flags, int8_t psind __unused)
{
	vm_paddr_t pa, opa;
	pt_entry_t *pte;
	pt_entry_t origpte, newpte;
	pv_entry_t pv;
	vm_page_t mpte, om;

	va &= ~PAGE_MASK;
	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
	    va >= kmi.clean_eva,
	    ("pmap_enter: managed mapping within the clean submap"));
	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
		VM_OBJECT_ASSERT_LOCKED(m->object);
	pa = VM_PAGE_TO_PHYS(m);
	/* Assemble the new PTE before taking any locks. */
	newpte = TLBLO_PA_TO_PFN(pa) | init_pte_prot(m, flags, prot);
	if ((flags & PMAP_ENTER_WIRED) != 0)
		newpte |= PTE_W;
	if (is_kernel_pmap(pmap))
		newpte |= PTE_G;
	if (is_cacheable_page(pa, m))
		newpte |= PTE_C_CACHE;
	else
		newpte |= PTE_C_UNCACHED;

	mpte = NULL;

	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);

	/*
	 * In the case that a page table page is not resident, we are
	 * creating it here.
	 */
	if (va < VM_MAXUSER_ADDRESS) {
		mpte = pmap_allocpte(pmap, va, flags);
		if (mpte == NULL) {
			KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0,
			    ("pmap_allocpte failed with sleep allowed"));
			rw_wunlock(&pvh_global_lock);
			PMAP_UNLOCK(pmap);
			return (KERN_RESOURCE_SHORTAGE);
		}
	}
	pte = pmap_pte(pmap, va);

	/*
	 * Page Directory table entry not valid, we need a new PT page
	 */
	if (pte == NULL) {
		panic("pmap_enter: invalid page directory, pdir=%p, va=%p",
		    (void *)pmap->pm_segtab, (void *)va);
	}
	om = NULL;
	origpte = *pte;
	opa = TLBLO_PTE_TO_PA(origpte);

	/*
	 * Mapping has not changed, must be protection or wiring change.
	 */
	if (pte_test(&origpte, PTE_V) && opa == pa) {
		/*
		 * Wiring change, just update stats. We don't worry about
		 * wiring PT pages as they remain resident as long as there
		 * are valid mappings in them. Hence, if a user page is
		 * wired, the PT page will be also.
		 */
		if (pte_test(&newpte, PTE_W) && !pte_test(&origpte, PTE_W))
			pmap->pm_stats.wired_count++;
		else if (!pte_test(&newpte, PTE_W) && pte_test(&origpte,
		    PTE_W))
			pmap->pm_stats.wired_count--;

		KASSERT(!pte_test(&origpte, PTE_D | PTE_RO),
		    ("%s: modified page not writable: va: %p, pte: %#jx",
		    __func__, (void *)va, (uintmax_t)origpte));

		/*
		 * Remove extra pte reference
		 */
		if (mpte)
			mpte->wire_count--;

		if (pte_test(&origpte, PTE_MANAGED)) {
			m->md.pv_flags |= PV_TABLE_REF;
			om = m;
			newpte |= PTE_MANAGED;
			if (!pte_test(&newpte, PTE_RO))
				vm_page_aflag_set(m, PGA_WRITEABLE);
		}
		goto validate;
	}

	pv = NULL;

	/*
	 * Mapping has changed, invalidate old range and fall through to
	 * handle validating new mapping.
	 */
	if (opa) {
		if (pte_test(&origpte, PTE_W))
			pmap->pm_stats.wired_count--;

		if (pte_test(&origpte, PTE_MANAGED)) {
			om = PHYS_TO_VM_PAGE(opa);
			/* Keep the old pv entry so it can be reused below. */
			pv = pmap_pvh_remove(&om->md, pmap, va);
		}
		if (mpte != NULL) {
			mpte->wire_count--;
			KASSERT(mpte->wire_count > 0,
			    ("pmap_enter: missing reference to page table page,"
			    " va: %p", (void *)va));
		}
	} else
		pmap->pm_stats.resident_count++;

	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0) {
		m->md.pv_flags |= PV_TABLE_REF;
		if (pv == NULL)
			pv = get_pv_entry(pmap, FALSE);
		pv->pv_va = va;
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
		newpte |= PTE_MANAGED;
		if (!pte_test(&newpte, PTE_RO))
			vm_page_aflag_set(m, PGA_WRITEABLE);
	} else if (pv != NULL)
		free_pv_entry(pmap, pv);

	/*
	 * Increment counters
	 */
	if (pte_test(&newpte, PTE_W))
		pmap->pm_stats.wired_count++;

validate:

#ifdef PMAP_DEBUG
	printf("pmap_enter: va: %p -> pa: %p\n", (void *)va, (void *)pa);
#endif

	/*
	 * if the mapping or permission bits are different, we need to
	 * update the pte.
	 */
	if (origpte != newpte) {
		*pte = newpte;
		if (pte_test(&origpte, PTE_V)) {
			/* "om" is the page the old PTE pointed at. */
			if (pte_test(&origpte, PTE_MANAGED) && opa != pa) {
				if (om->md.pv_flags & PV_TABLE_REF)
					vm_page_aflag_set(om, PGA_REFERENCED);
				om->md.pv_flags &= ~PV_TABLE_REF;
			}
			if (pte_test(&origpte, PTE_D)) {
				KASSERT(!pte_test(&origpte, PTE_RO),
				    ("pmap_enter: modified page not writable:"
				    " va: %p, pte: %#jx", (void *)va, (uintmax_t)origpte));
				if (pte_test(&origpte, PTE_MANAGED))
					vm_page_dirty(om);
			}
			if (pte_test(&origpte, PTE_MANAGED) &&
			    TAILQ_EMPTY(&om->md.pv_list))
				vm_page_aflag_clear(om, PGA_WRITEABLE);
			pmap_update_page(pmap, va, newpte);
		}
	}

	/*
	 * Sync I & D caches for executable pages.  Do this only if the
	 * target pmap belongs to the current process.  Otherwise, an
	 * unresolvable TLB miss may occur.
	 */
	if (!is_kernel_pmap(pmap) && (pmap == &curproc->p_vmspace->vm_pmap) &&
	    (prot & VM_PROT_EXECUTE)) {
		mips_icache_sync_range(va, PAGE_SIZE);
		mips_dcache_wbinv_range(va, PAGE_SIZE);
	}
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
	return (KERN_SUCCESS);
}

/*
 * this code makes some *MAJOR* assumptions:
 * 1. Current pmap & pmap exists.
 * 2. Not wired.
 * 3. Read access.
 * 4. No page table pages.
 * but is *MUCH* faster than pmap_enter...
 *
 * This wrapper only takes the locks around pmap_enter_quick_locked().
 */
void
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
{

	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
}

/*
 * Locked worker for pmap_enter_quick() and pmap_enter_object().  Installs
 * a read-only mapping of "m" at "va" unless a valid mapping already
 * exists or resources are short; it never sleeps for a page table page.
 *
 * "mpte" is the page table page returned by the previous call (or NULL);
 * it is reused when it covers "va".  The return value is the page table
 * page to pass to the next call, or NULL.
 */
static vm_page_t
pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, vm_page_t mpte)
{
	pt_entry_t *pte;
	vm_paddr_t pa;

	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
	    (m->oflags & VPO_UNMANAGED) != 0,
	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * In the case that a page table page is not resident, we are
	 * creating it here.
	 */
	if (va < VM_MAXUSER_ADDRESS) {
		pd_entry_t *pde;
		unsigned ptepindex;

		/*
		 * Calculate pagetable page index
		 */
		ptepindex = pmap_pde_pindex(va);
		if (mpte && (mpte->pindex == ptepindex)) {
			mpte->wire_count++;
		} else {
			/*
			 * Get the page directory entry
			 */
			pde = pmap_pde(pmap, va);

			/*
			 * If the page table page is mapped, we just
			 * increment the hold count, and activate it.
			 */
			if (pde && *pde != 0) {
				mpte = PHYS_TO_VM_PAGE(
				    MIPS_DIRECT_TO_PHYS(*pde));
				mpte->wire_count++;
			} else {
				mpte = _pmap_allocpte(pmap, ptepindex,
				    PMAP_ENTER_NOSLEEP);
				if (mpte == NULL)
					return (mpte);
			}
		}
	} else {
		mpte = NULL;
	}

	pte = pmap_pte(pmap, va);
	if (pte_test(pte, PTE_V)) {
		/* Already mapped: give back the reference taken above. */
		if (mpte != NULL) {
			mpte->wire_count--;
			mpte = NULL;
		}
		return (mpte);
	}

	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0 &&
	    !pmap_try_insert_pv_entry(pmap, mpte, va, m)) {
		/* No pv entry available: undo the page table reference. */
		if (mpte != NULL) {
			pmap_unwire_ptp(pmap, va, mpte);
			mpte = NULL;
		}
		return (mpte);
	}

	/*
	 * Increment counters
	 */
	pmap->pm_stats.resident_count++;

	pa = VM_PAGE_TO_PHYS(m);

	/*
	 * Now validate mapping with RO protection
	 */
	*pte = PTE_RO | TLBLO_PA_TO_PFN(pa) | PTE_V;
	if ((m->oflags & VPO_UNMANAGED) == 0)
		*pte |= PTE_MANAGED;

	if (is_cacheable_page(pa, m))
		*pte |= PTE_C_CACHE;
	else
		*pte |= PTE_C_UNCACHED;

	if (is_kernel_pmap(pmap))
		*pte |= PTE_G;
	else {
		/*
		 * Sync I & D caches.  Do this only if the target pmap
		 * belongs to the current process.  Otherwise, an
		 * unresolvable TLB miss may occur.
		 */
		if (pmap == &curproc->p_vmspace->vm_pmap) {
			va &= ~PAGE_MASK;
			mips_icache_sync_range(va, PAGE_SIZE);
			mips_dcache_wbinv_range(va, PAGE_SIZE);
		}
	}
	return (mpte);
}

/*
 * Make a temporary mapping for a physical address.  This is only intended
 * to be used for panic dumps.
 *
 * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit.
 *
 * Only index 0 is supported; any other "i" just logs an error.
 * NOTE(review): with __mips_n64 defined, "va" is never assigned when
 * MIPS_DIRECT_MAPPABLE(pa) is false -- confirm that cannot happen there.
 */
void *
pmap_kenter_temporary(vm_paddr_t pa, int i)
{
	vm_offset_t va;

	if (i != 0)
		printf("%s: ERROR!!! More than one page of virtual address mapping not supported\n",
		    __func__);

	if (MIPS_DIRECT_MAPPABLE(pa)) {
		va = MIPS_PHYS_TO_DIRECT(pa);
	} else {
#ifndef __mips_n64    /* XXX : to be converted to new style */
		int cpu;
		register_t intr;
		struct local_sysmaps *sysm;
		pt_entry_t *pte, npte;

		/*
		 * If this is used other than for dumps, we may need to leave
		 * interrupts disabled on return. If crash dumps don't work when
		 * we get to this point, we might want to consider this (leaving
		 * things disabled as a starting point ;-)
		 */
		intr = intr_disable();
		cpu = PCPU_GET(cpuid);
		sysm = &sysmap_lmem[cpu];
		/* Since this is for the debugger, no locks or any other fun */
		npte = TLBLO_PA_TO_PFN(pa) | PTE_C_CACHE | PTE_D | PTE_V |
		    PTE_G;
		pte = pmap_pte(kernel_pmap, sysm->base);
		*pte = npte;
		/* Marked so pmap_kenter_temporary_free() knows to undo it. */
		sysm->valid1 = 1;
		pmap_update_page(kernel_pmap, sysm->base, npte);
		va = sysm->base;
		intr_restore(intr);
#endif
	}
	return ((void *)va);
}

/*
 * Undo pmap_kenter_temporary().  Direct-mappable addresses need no work;
 * otherwise (non-n64 only) the per-CPU temporary PTE is reset to PTE_G
 * and its TLB entry invalidated if it is marked valid.
 */
void
pmap_kenter_temporary_free(vm_paddr_t pa)
{
#ifndef __mips_n64    /* XXX : to be converted to new style */
	int cpu;
	register_t intr;
	struct local_sysmaps *sysm;
#endif

	if (MIPS_DIRECT_MAPPABLE(pa)) {
		/* nothing to do for this case */
		return;
	}
#ifndef __mips_n64    /* XXX : to be converted to new style */
	cpu = PCPU_GET(cpuid);
	sysm = &sysmap_lmem[cpu];
	if (sysm->valid1) {
		pt_entry_t *pte;

		intr = intr_disable();
		pte = pmap_pte(kernel_pmap, sysm->base);
		*pte = PTE_G;
		pmap_invalidate_page(kernel_pmap, sysm->base);
		intr_restore(intr);
		sysm->valid1 = 0;
	}
#endif
}

/*
 * Maps a sequence of resident pages belonging to the same object.
 * The sequence begins with the given page m_start.  This page is
 * mapped at the given virtual address start.  Each subsequent page is
 * mapped at a virtual address that is offset from start by the same
 * amount as the page is offset from m_start within the object.  The
 * last page in the sequence is the page with the largest offset from
 * m_start that can be mapped at a virtual address less than the given
 * virtual address end.
Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m, prot, mpte); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. 
*/ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pd_entry_t *pde, *pdpe; pt_entry_t *pte; vm_offset_t va_next; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == NULL) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (!pte_test(pte, PTE_V)) continue; if (!pte_test(pte, PTE_W)) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); pte_clear(pte, PTE_W); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va; vm_paddr_t phys = VM_PAGE_TO_PHYS(m); if (MIPS_DIRECT_MAPPABLE(phys)) { va = MIPS_PHYS_TO_DIRECT(phys); bzero((caddr_t)va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); } else { va = pmap_lmem_map1(phys); bzero((caddr_t)va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); pmap_lmem_unmap(); } } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. 
*/ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va; vm_paddr_t phys = VM_PAGE_TO_PHYS(m); if (MIPS_DIRECT_MAPPABLE(phys)) { va = MIPS_PHYS_TO_DIRECT(phys); bzero((char *)(caddr_t)va + off, size); mips_dcache_wbinv_range(va + off, size); } else { va = pmap_lmem_map1(phys); bzero((char *)va + off, size); mips_dcache_wbinv_range(va + off, size); pmap_lmem_unmap(); } } void pmap_zero_page_idle(vm_page_t m) { vm_offset_t va; vm_paddr_t phys = VM_PAGE_TO_PHYS(m); if (MIPS_DIRECT_MAPPABLE(phys)) { va = MIPS_PHYS_TO_DIRECT(phys); bzero((caddr_t)va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); } else { va = pmap_lmem_map1(phys); bzero((caddr_t)va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); pmap_lmem_unmap(); } } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. 
*/ void pmap_copy_page(vm_page_t src, vm_page_t dst) { vm_offset_t va_src, va_dst; vm_paddr_t phys_src = VM_PAGE_TO_PHYS(src); vm_paddr_t phys_dst = VM_PAGE_TO_PHYS(dst); if (MIPS_DIRECT_MAPPABLE(phys_src) && MIPS_DIRECT_MAPPABLE(phys_dst)) { /* easy case, all can be accessed via KSEG0 */ /* * Flush all caches for VA that are mapped to this page * to make sure that data in SDRAM is up to date */ pmap_flush_pvcache(src); mips_dcache_wbinv_range_index( MIPS_PHYS_TO_DIRECT(phys_dst), PAGE_SIZE); va_src = MIPS_PHYS_TO_DIRECT(phys_src); va_dst = MIPS_PHYS_TO_DIRECT(phys_dst); bcopy((caddr_t)va_src, (caddr_t)va_dst, PAGE_SIZE); mips_dcache_wbinv_range(va_dst, PAGE_SIZE); } else { va_src = pmap_lmem_map2(phys_src, phys_dst); va_dst = va_src + PAGE_SIZE; bcopy((void *)va_src, (void *)va_dst, PAGE_SIZE); mips_dcache_wbinv_range(va_dst, PAGE_SIZE); pmap_lmem_unmap(); } } int unmapped_buf_allowed; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { char *a_cp, *b_cp; vm_page_t a_m, b_m; vm_offset_t a_pg_offset, b_pg_offset; vm_paddr_t a_phys, b_phys; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); a_m = ma[a_offset >> PAGE_SHIFT]; a_phys = VM_PAGE_TO_PHYS(a_m); b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); b_m = mb[b_offset >> PAGE_SHIFT]; b_phys = VM_PAGE_TO_PHYS(b_m); if (MIPS_DIRECT_MAPPABLE(a_phys) && MIPS_DIRECT_MAPPABLE(b_phys)) { pmap_flush_pvcache(a_m); mips_dcache_wbinv_range_index( MIPS_PHYS_TO_DIRECT(b_phys), PAGE_SIZE); a_cp = (char *)MIPS_PHYS_TO_DIRECT(a_phys) + a_pg_offset; b_cp = (char *)MIPS_PHYS_TO_DIRECT(b_phys) + b_pg_offset; bcopy(a_cp, b_cp, cnt); mips_dcache_wbinv_range((vm_offset_t)b_cp, cnt); } else { a_cp = (char *)pmap_lmem_map2(a_phys, b_phys); b_cp = (char *)a_cp + PAGE_SIZE; a_cp += a_pg_offset; b_cp += b_pg_offset; bcopy(a_cp, b_cp, cnt); mips_dcache_wbinv_range((vm_offset_t)b_cp, cnt); 
pmap_lmem_unmap(); } a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { #if defined(__mips_n64) return MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); #else vm_paddr_t pa; struct local_sysmaps *sysm; pt_entry_t *pte; pa = VM_PAGE_TO_PHYS(m); if (MIPS_DIRECT_MAPPABLE(pa)) { if (m->md.pv_flags & PV_MEMATTR_UNCACHEABLE) return (MIPS_PHYS_TO_DIRECT_UNCACHED(pa)); else return (MIPS_PHYS_TO_DIRECT(pa)); } critical_enter(); sysm = &sysmap_lmem[PCPU_GET(cpuid)]; KASSERT(sysm->valid1 == 0, ("pmap_quick_enter_page: PTE busy")); pte = pmap_pte(kernel_pmap, sysm->base); *pte = TLBLO_PA_TO_PFN(pa) | PTE_D | PTE_V | PTE_G | (is_cacheable_page(pa, m) ? PTE_C_CACHE : PTE_C_UNCACHED); sysm->valid1 = 1; return (sysm->base); #endif } void pmap_quick_remove_page(vm_offset_t addr) { mips_dcache_wbinv_range(addr, PAGE_SIZE); #if !defined(__mips_n64) struct local_sysmaps *sysm; pt_entry_t *pte; if (addr >= MIPS_KSEG0_START && addr < MIPS_KSEG0_END) return; sysm = &sysmap_lmem[PCPU_GET(cpuid)]; KASSERT(sysm->valid1 != 0, ("pmap_quick_remove_page: PTE not in use")); KASSERT(sysm->base == addr, ("pmap_quick_remove_page: invalid address")); pte = pmap_pte(kernel_pmap, addr); *pte = PTE_G; tlb_invalidate_address(kernel_pmap, addr); sysm->valid1 = 0; critical_exit(); #endif } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. 
*/
boolean_t
pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
{
	pv_entry_t pv;
	boolean_t found = FALSE;
	int examined = 0;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_page_exists_quick: page %p is not managed", m));

	rw_wlock(&pvh_global_lock);
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		if (PV_PMAP(pv) == pmap) {
			found = TRUE;
			break;
		}
		/* Give up after looking at 16 entries. */
		if (++examined >= 16)
			break;
	}
	rw_wunlock(&pvh_global_lock);
	return (found);
}

/*
 * Remove all pages from specified address space
 * this aids process exit speeds.  Also, this code
 * is special cased for current process only, but
 * can have the more generic (and slightly slower)
 * mode enabled.  This is much faster than pmap_remove
 * in the case of running down an entire address space.
 */
void
pmap_remove_pages(pmap_t pmap)
{
	pd_entry_t *pde;
	pt_entry_t *pte, tpte;
	pv_entry_t pv;
	vm_page_t m;
	struct pv_chunk *pc, *npc;
	u_long inuse, bitmask;
	int allfree, bit, field, idx;

	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
		printf("warning: pmap_remove_pages called with non-current pmap\n");
		return;
	}
	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
		allfree = 1;
		for (field = 0; field < _NPCM; field++) {
			inuse = ~pc->pc_map[field] & pc_freemask[field];
			while (inuse != 0) {
				bit = ffsl(inuse) - 1;
				bitmask = 1UL << bit;
				idx = field * sizeof(inuse) * NBBY + bit;
				pv = &pc->pc_pventry[idx];
				inuse &= ~bitmask;

				pde = pmap_pde(pmap, pv->pv_va);
				KASSERT(pde != NULL && *pde != 0,
				    ("pmap_remove_pages: pde"));
				pte = pmap_pde_to_pte(pde, pv->pv_va);
				if (!pte_test(pte, PTE_V))
					panic("pmap_remove_pages: bad pte");
				tpte = *pte;

/*
 * We cannot remove wired pages from a process' mapping at this time
 */
				if (pte_test(&tpte, PTE_W)) {
					allfree = 0;
					continue;
				}
				*pte = is_kernel_pmap(pmap) ? PTE_G : 0;

				m = PHYS_TO_VM_PAGE(TLBLO_PTE_TO_PA(tpte));
				KASSERT(m != NULL,
				    ("pmap_remove_pages: bad tpte %#jx",
				    (uintmax_t)tpte));

				/*
				 * Update the vm_page_t clean and reference bits.
*/
				if (pte_test(&tpte, PTE_D))
					vm_page_dirty(m);

				/* Mark free */
				PV_STAT(pv_entry_frees++);
				PV_STAT(pv_entry_spare++);
				pv_entry_count--;
				pc->pc_map[field] |= bitmask;
				pmap->pm_stats.resident_count--;
				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
				/* Last mapping gone: page is no longer writeable. */
				if (TAILQ_EMPTY(&m->md.pv_list))
					vm_page_aflag_clear(m, PGA_WRITEABLE);
				pmap_unuse_pt(pmap, pv->pv_va, *pde);
			}
		}
		/* Free the chunk only if no wired entry was left in it. */
		if (allfree) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			free_pv_chunk(pc);
		}
	}
	pmap_invalidate_all(pmap);
	PMAP_UNLOCK(pmap);
	rw_wunlock(&pvh_global_lock);
}

/*
 * pmap_testbit tests bits in pte's
 *
 * Returns TRUE as soon as one mapping of page m has "bit" set in its PTE,
 * FALSE if none does or if the page is unmanaged.  The caller must hold
 * pvh_global_lock write-locked (asserted below); each pmap is locked only
 * while its PTE is examined.
 */
static boolean_t
pmap_testbit(vm_page_t m, int bit)
{
	pv_entry_t pv;
	pmap_t pmap;
	pt_entry_t *pte;
	boolean_t rv = FALSE;

	if (m->oflags & VPO_UNMANAGED)
		return (rv);

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte = pmap_pte(pmap, pv->pv_va);
		rv = pte_test(pte, bit);
		PMAP_UNLOCK(pmap);
		if (rv)
			break;
	}
	return (rv);
}

/*
 *	pmap_page_wired_mappings:
 *
 *	Return the number of managed mappings to the given physical page
 *	that are wired.
 *
 *	Unmanaged pages always yield 0.
 */
int
pmap_page_wired_mappings(vm_page_t m)
{
	pv_entry_t pv;
	pmap_t pmap;
	pt_entry_t *pte;
	int count;

	count = 0;
	if ((m->oflags & VPO_UNMANAGED) != 0)
		return (count);
	rw_wlock(&pvh_global_lock);
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte = pmap_pte(pmap, pv->pv_va);
		if (pte_test(pte, PTE_W))
			count++;
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(&pvh_global_lock);
	return (count);
}

/*
 * Clear the write and modified bits in each of the given page's mappings.
 */
void
pmap_remove_write(vm_page_t m)
{
	pmap_t pmap;
	pt_entry_t pbits, *pte;
	pv_entry_t pv;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_write: page %p is not managed", m));

	/*
	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
	 * set by another thread while the object is locked.  Thus,
	 * if PGA_WRITEABLE is clear, no page table entries need updating.
*/ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); KASSERT(pte != NULL && pte_test(pte, PTE_V), ("page on pv_list has no pte")); pbits = *pte; if (pte_test(&pbits, PTE_D)) { pte_clear(&pbits, PTE_D); vm_page_dirty(m); } pte_set(&pbits, PTE_RO); if (pbits != *pte) { *pte = pbits; pmap_update_page(pmap, pv->pv_va, pbits); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); } /* * pmap_ts_referenced: * * Return the count of reference bits for a page, clearing all of them. */ int pmap_ts_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); if (m->md.pv_flags & PV_TABLE_REF) { rw_wlock(&pvh_global_lock); m->md.pv_flags &= ~PV_TABLE_REF; rw_wunlock(&pvh_global_lock); return (1); } return (0); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PTE_D set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_testbit(m, PTE_D); rw_wunlock(&pvh_global_lock); return (rv); } /* N/C */ /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. 
*/
boolean_t
pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
{
	pd_entry_t *pde;
	pt_entry_t *pte;
	boolean_t rv;

	rv = FALSE;
	PMAP_LOCK(pmap);
	pde = pmap_pde(pmap, addr);
	/* TRUE only when a page table exists and its PTE for addr is zero. */
	if (pde != NULL && *pde != 0) {
		pte = pmap_pde_to_pte(pde, addr);
		rv = (*pte == 0);
	}
	PMAP_UNLOCK(pmap);
	return (rv);
}

/*
 *	Apply the given advice to the specified range of addresses within the
 *	given pmap.  Depending on the advice, clear the referenced and/or
 *	modified flags in each mapping and set the mapped page's dirty field.
 *
 *	Any advice other than MADV_DONTNEED or MADV_FREE is ignored.  While
 *	scanning a page-table page, "va" marks the start of a pending run of
 *	addresses whose TLB entries must be invalidated; va == va_next means
 *	no run is pending.
 */
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
	pd_entry_t *pde, *pdpe;
	pt_entry_t *pte;
	vm_offset_t va, va_next;
	vm_paddr_t pa;
	vm_page_t m;

	if (advice != MADV_DONTNEED && advice != MADV_FREE)
		return;
	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	for (; sva < eva; sva = va_next) {
		pdpe = pmap_segmap(pmap, sva);
#ifdef __mips_n64
		if (*pdpe == 0) {
			va_next = (sva + NBSEG) & ~SEGMASK;
			if (va_next < sva)
				va_next = eva;
			continue;
		}
#endif
		va_next = (sva + NBPDR) & ~PDRMASK;
		if (va_next < sva)
			va_next = eva;

		pde = pmap_pdpe_to_pde(pdpe, sva);
		if (*pde == NULL)
			continue;

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current page table page, or to the end of the
		 * range being advised.
		 */
		if (va_next > eva)
			va_next = eva;

		va = va_next;
		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
		    sva += PAGE_SIZE) {
			if (!pte_test(pte, PTE_MANAGED | PTE_V)) {
				/* Flush the pending run before skipping. */
				if (va != va_next) {
					pmap_invalidate_range(pmap, va, sva);
					va = va_next;
				}
				continue;
			}
			pa = TLBLO_PTE_TO_PA(*pte);
			m = PHYS_TO_VM_PAGE(pa);
			m->md.pv_flags &= ~PV_TABLE_REF;
			if (pte_test(pte, PTE_D)) {
				if (advice == MADV_DONTNEED) {
					/*
					 * Future calls to pmap_is_modified()
					 * can be avoided by making the page
					 * dirty now.
					 */
					vm_page_dirty(m);
				} else {
					/* MADV_FREE: drop PTE_D, start a run. */
					pte_clear(pte, PTE_D);
					if (va == va_next)
						va = sva;
				}
			} else {
				/*
				 * Unless PTE_D is set, any TLB entries
				 * mapping "sva" don't allow write access, so
				 * they needn't be invalidated.
				 */
				if (va != va_next) {
					pmap_invalidate_range(pmap, va, sva);
					va = va_next;
				}
			}
		}
		/* Flush whatever run is still pending for this table page. */
		if (va != va_next)
			pmap_invalidate_range(pmap, va, sva);
	}
	/*
	 * NOTE(review): the locks are released in acquisition order here,
	 * whereas pmap_remove_pages() releases them in reverse order.
	 */
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
}

/*
 *	Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(vm_page_t m)
{
	pmap_t pmap;
	pt_entry_t *pte;
	pv_entry_t pv;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_clear_modify: page %p is not managed", m));
	VM_OBJECT_ASSERT_WLOCKED(m->object);
	KASSERT(!vm_page_xbusied(m),
	    ("pmap_clear_modify: page %p is exclusive busied", m));

	/*
	 * If the page is not PGA_WRITEABLE, then no PTEs can have PTE_D set.
	 * If the object containing the page is locked and the page is not
	 * write busied, then PGA_WRITEABLE cannot be concurrently set.
	 */
	if ((m->aflags & PGA_WRITEABLE) == 0)
		return;
	rw_wlock(&pvh_global_lock);
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte = pmap_pte(pmap, pv->pv_va);
		if (pte_test(pte, PTE_D)) {
			pte_clear(pte, PTE_D);
			pmap_update_page(pmap, pv->pv_va, *pte);
		}
		PMAP_UNLOCK(pmap);
	}
	rw_wunlock(&pvh_global_lock);
}

/*
 *	pmap_is_referenced:
 *
 *	Return whether or not the specified physical page was referenced
 *	in any physical maps.
 *
 *	Reads the page's PV_TABLE_REF flag without taking any lock.
 */
boolean_t
pmap_is_referenced(vm_page_t m)
{

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_is_referenced: page %p is not managed", m));
	return ((m->md.pv_flags & PV_TABLE_REF) != 0);
}

/*
 * Miscellaneous support routines follow
 */

/*
 * Map a set of physical memory pages into the kernel virtual
 * address space. Return a pointer to where it is mapped. This
 * routine is intended to be used for mapping device memory,
 * NOT real memory.
 *
 * Use XKPHYS uncached for 64 bit, and KSEG1 where possible for 32 bit.
 */
void *
pmap_mapdev(vm_paddr_t pa, vm_size_t size)
{
        vm_offset_t va, tmpva, offset;

	/*
	 * KSEG1 maps only first 512M of phys address space. For
	 * pa > 0x20000000 we should make proper mapping
	 * using pmap_kenter.
*/
	/* The last byte of the range decides whether the direct map fits. */
	if (MIPS_DIRECT_MAPPABLE(pa + size - 1))
		return ((void *)MIPS_PHYS_TO_DIRECT_UNCACHED(pa));
	else {
		/* Keep the sub-page offset; map whole pages uncached. */
		offset = pa & PAGE_MASK;
		size = roundup(size + offset, PAGE_SIZE);

		va = kva_alloc(size);
		if (!va)
			panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
		pa = trunc_page(pa);
		for (tmpva = va; size > 0;) {
			pmap_kenter_attr(tmpva, pa, PTE_C_UNCACHED);
			size -= PAGE_SIZE;
			tmpva += PAGE_SIZE;
			pa += PAGE_SIZE;
		}
	}

	return ((void *)(va + offset));
}

/*
 * Release a mapping made by pmap_mapdev().  Does nothing on n64 or for
 * KSEG1 addresses; otherwise frees the page-rounded virtual range.
 */
void
pmap_unmapdev(vm_offset_t va, vm_size_t size)
{
#ifndef __mips_n64
	vm_offset_t base, offset;

	/* If the address is within KSEG1 then there is nothing to do */
	if (va >= MIPS_KSEG1_START && va <= MIPS_KSEG1_END)
		return;

	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = roundup(size + offset, PAGE_SIZE);
	kva_free(base, size);
#endif
}

/*
 * perform the pmap work for mincore
 *
 * Returns a MINCORE_* bit mask for addr in pmap: 0 if the PTE is missing
 * or invalid, otherwise MINCORE_INCORE plus modified/referenced bits.
 */
int
pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
{
	pt_entry_t *ptep, pte;
	vm_paddr_t pa;
	vm_page_t m;
	int val;

	PMAP_LOCK(pmap);
retry:
	ptep = pmap_pte(pmap, addr);
	/* Work on a snapshot of the PTE; a missing PTE reads as 0. */
	pte = (ptep != NULL) ? *ptep : 0;
	if (!pte_test(&pte, PTE_V)) {
		val = 0;
		goto out;
	}
	val = MINCORE_INCORE;
	if (pte_test(&pte, PTE_D))
		val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
	pa = TLBLO_PTE_TO_PA(pte);
	if (pte_test(&pte, PTE_MANAGED)) {
		/*
		 * This may falsely report the given address as
		 * MINCORE_REFERENCED.  Unfortunately, due to the lack of
		 * per-PTE reference information, it is impossible to
		 * determine if the address is MINCORE_REFERENCED.
		 */
		m = PHYS_TO_VM_PAGE(pa);
		if ((m->aflags & PGA_REFERENCED) != 0)
			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
	}
	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
	    pte_test(&pte, PTE_MANAGED)) {
		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
			goto retry;
	} else
		/*
		 * The "out" label sits inside the else arm on purpose: both
		 * the early exit above and this arm run PA_UNLOCK_COND().
		 */
out:
		PA_UNLOCK_COND(*locked_pa);
	PMAP_UNLOCK(pmap);
	return (val);
}

/*
 * Make the pmap of td's process the active one on the current CPU.  The
 * CPU is moved from the old pmap's pm_active set to the new one, and an
 * ASID is allocated.  The segment base and EntryHi are loaded only when
 * td is the running thread.
 */
void
pmap_activate(struct thread *td)
{
	pmap_t pmap, oldpmap;
	struct proc *p = td->td_proc;
	u_int cpuid;

	critical_enter();

	pmap = vmspace_pmap(p->p_vmspace);
	oldpmap = PCPU_GET(curpmap);
	cpuid = PCPU_GET(cpuid);

	if (oldpmap)
		CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
	pmap_asid_alloc(pmap);
	if (td == curthread) {
		PCPU_SET(segbase, pmap->pm_segtab);
		mips_wr_entryhi(pmap->pm_asid[cpuid].asid);
	}

	PCPU_SET(curpmap, pmap);
	critical_exit();
}

/* Intentionally empty in this implementation. */
void
pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
{
}

/*
 *	Increase the starting virtual address of the given mapping if a
 *	different alignment might result in more superpage mappings.
 *
 *	Mappings smaller than NBSEG are left untouched.
 */
void
pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t size)
{
	vm_offset_t superpage_offset;

	if (size < NBSEG)
		return;
	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
		offset += ptoa(object->pg_color);
	superpage_offset = offset & SEGMASK;
	/* Give up if no full segment would remain, or already aligned. */
	if (size - ((NBSEG - superpage_offset) & SEGMASK) < NBSEG ||
	    (*addr & SEGMASK) == superpage_offset)
		return;
	if ((*addr & SEGMASK) < superpage_offset)
		*addr = (*addr & ~SEGMASK) + superpage_offset;
	else
		*addr = ((*addr + SEGMASK) & ~SEGMASK) + superpage_offset;
}

#ifdef DDB
/*
 * ddb "show ptable": dump the page tables of the pmap belonging to the
 * given pid/tid, or of kernel_pmap when no address is supplied.
 */
DB_SHOW_COMMAND(ptable, ddb_pid_dump)
{
	pmap_t pmap;
	struct thread *td = NULL;
	struct proc *p;
	int i, j, k;
	vm_paddr_t pa;
	vm_offset_t va;

	if (have_addr) {
		td = db_lookup_thread(addr, true);
		if (td == NULL) {
			db_printf("Invalid pid or tid");
			return;
		}
		p = td->td_proc;
		if (p->p_vmspace == NULL) {
			db_printf("No vmspace for process");
			return;
		}
			pmap = vmspace_pmap(p->p_vmspace);
	} else
		pmap = kernel_pmap;

	db_printf("pmap:%p segtab:%p asid:%x generation:%x\n",
	    pmap, pmap->pm_segtab, pmap->pm_asid[0].asid,
	    pmap->pm_asid[0].gen);
	for (i = 0; i < NPDEPG; i++) {
		pd_entry_t *pdpe;
		pt_entry_t *pde;
		pt_entry_t pte;

pdpe = (pd_entry_t *)pmap->pm_segtab[i]; if (pdpe == NULL) continue; db_printf("[%4d] %p\n", i, pdpe); #ifdef __mips_n64 for (j = 0; j < NPDEPG; j++) { pde = (pt_entry_t *)pdpe[j]; if (pde == NULL) continue; db_printf("\t[%4d] %p\n", j, pde); #else { j = 0; pde = (pt_entry_t *)pdpe; #endif for (k = 0; k < NPTEPG; k++) { pte = pde[k]; if (pte == 0 || !pte_test(&pte, PTE_V)) continue; pa = TLBLO_PTE_TO_PA(pte); va = ((u_long)i << SEGSHIFT) | (j << PDRSHIFT) | (k << PAGE_SHIFT); db_printf("\t\t[%04d] va: %p pte: %8jx pa:%jx\n", k, (void *)va, (uintmax_t)pte, (uintmax_t)pa); } } } } #endif /* * Allocate TLB address space tag (called ASID or TLBPID) and return it. * It takes almost as much or more time to search the TLB for a * specific ASID and flush those entries as it does to flush the entire TLB. * Therefore, when we allocate a new ASID, we just take the next number. When * we run out of numbers, we flush the TLB, increment the generation count * and start over. ASID zero is reserved for kernel use. */ static void pmap_asid_alloc(pmap) pmap_t pmap; { if (pmap->pm_asid[PCPU_GET(cpuid)].asid != PMAP_ASID_RESERVED && pmap->pm_asid[PCPU_GET(cpuid)].gen == PCPU_GET(asid_generation)); else { if (PCPU_GET(next_asid) == pmap_max_asid) { tlb_invalidate_all_user(NULL); PCPU_SET(asid_generation, (PCPU_GET(asid_generation) + 1) & ASIDGEN_MASK); if (PCPU_GET(asid_generation) == 0) { PCPU_SET(asid_generation, 1); } PCPU_SET(next_asid, 1); /* 0 means invalid */ } pmap->pm_asid[PCPU_GET(cpuid)].asid = PCPU_GET(next_asid); pmap->pm_asid[PCPU_GET(cpuid)].gen = PCPU_GET(asid_generation); PCPU_SET(next_asid, PCPU_GET(next_asid) + 1); } } static pt_entry_t init_pte_prot(vm_page_t m, vm_prot_t access, vm_prot_t prot) { pt_entry_t rw; if (!(prot & VM_PROT_WRITE)) rw = PTE_V | PTE_RO; else if ((m->oflags & VPO_UNMANAGED) == 0) { if ((access & VM_PROT_WRITE) != 0) rw = PTE_V | PTE_D; else rw = PTE_V; } else /* Needn't emulate a modified bit for unmanaged pages. 
*/
		rw = PTE_V | PTE_D;
	return (rw);
}

/*
 * pmap_emulate_modified : do dirty bit emulation
 *
 * On SMP, update just the local TLB, other CPUs will update their
 * TLBs from PTE lazily, if they get the exception.
 * Returns 0 in case of success, 1 if the page is read only and we
 * need to fault.
 *
 * Panics if no PTE exists for va, or if the PTE is writable but not
 * PTE_MANAGED.
 */
int
pmap_emulate_modified(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *pte;

	PMAP_LOCK(pmap);
	pte = pmap_pte(pmap, va);
	if (pte == NULL)
		panic("pmap_emulate_modified: can't find PTE");
#ifdef SMP
	/* It is possible that some other CPU changed m-bit */
	if (!pte_test(pte, PTE_V) || pte_test(pte, PTE_D)) {
		tlb_update(pmap, va, *pte);
		PMAP_UNLOCK(pmap);
		return (0);
	}
#else
	/* On UP an invalid or already-dirty PTE here is a fatal error. */
	if (!pte_test(pte, PTE_V) || pte_test(pte, PTE_D))
		panic("pmap_emulate_modified: invalid pte");
#endif
	if (pte_test(pte, PTE_RO)) {
		PMAP_UNLOCK(pmap);
		return (1);
	}
	pte_set(pte, PTE_D);
	tlb_update(pmap, va, *pte);
	if (!pte_test(pte, PTE_MANAGED))
		panic("pmap_emulate_modified: unmanaged page");
	PMAP_UNLOCK(pmap);
	return (0);
}

/*
 *	Routine:	pmap_kextract
 *	Function:
 *		Extract the physical page address associated
 *		virtual address.
 */
vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	int mapped;

	/*
	 * First, the direct-mapped regions.
	 */
#if defined(__mips_n64)
	if (va >= MIPS_XKPHYS_START && va < MIPS_XKPHYS_END)
		return (MIPS_XKPHYS_TO_PHYS(va));
#endif
	if (va >= MIPS_KSEG0_START && va < MIPS_KSEG0_END)
		return (MIPS_KSEG0_TO_PHYS(va));

	if (va >= MIPS_KSEG1_START && va < MIPS_KSEG1_END)
		return (MIPS_KSEG1_TO_PHYS(va));

	/*
	 * User virtual addresses.
	 */
	if (va < VM_MAXUSER_ADDRESS) {
		pt_entry_t *ptep;

		if (curproc && curproc->p_vmspace) {
			ptep = pmap_pte(&curproc->p_vmspace->vm_pmap, va);
			if (ptep) {
				return (TLBLO_PTE_TO_PA(*ptep) |
				    (va & PAGE_MASK));
			}
			return (0);
		}
	}

	/*
	 * Should be kernel virtual here, otherwise fail
	 *
	 * NOTE(review): "va >= START || va < END" appears to be true for
	 * every va (assuming START <= END), so "mapped" is always set and
	 * the failure path can never be taken.  Narrowing it to "&&" is not
	 * a drop-in fix: confirm the KSEG2/XKSEG bounds cover the whole
	 * kernel virtual range first.
	 */
	mapped = (va >= MIPS_KSEG2_START || va < MIPS_KSEG2_END);
#if defined(__mips_n64)
	mapped = mapped || (va >= MIPS_XKSEG_START || va < MIPS_XKSEG_END);
#endif
	/*
	 * Kernel virtual.
*/ if (mapped) { pt_entry_t *ptep; /* Is the kernel pmap initialized? */ if (!CPU_EMPTY(&kernel_pmap->pm_active)) { /* It's inside the virtual address range */ ptep = pmap_pte(kernel_pmap, va); if (ptep) { return (TLBLO_PTE_TO_PA(*ptep) | (va & PAGE_MASK)); } } return (0); } panic("%s for unknown address space %p.", __func__, (void *)va); } void pmap_flush_pvcache(vm_page_t m) { pv_entry_t pv; if (m != NULL) { for (pv = TAILQ_FIRST(&m->md.pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { mips_dcache_wbinv_range_index(pv->pv_va, PAGE_SIZE); } } } void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { /* * It appears that this function can only be called before any mappings * for the page are established. If this ever changes, this code will * need to walk the pv_list and make each of the existing mappings * uncacheable, being careful to sync caches and PTEs (and maybe * invalidate TLB?) for any current mapping it modifies. */ if (TAILQ_FIRST(&m->md.pv_list) != NULL) panic("Can't change memattr on page with existing mappings"); /* * The only memattr we support is UNCACHEABLE, translate the (semi-)MI * representation of that into our internal flag in the page MD struct. */ if (ma == VM_MEMATTR_UNCACHEABLE) m->md.pv_flags |= PV_MEMATTR_UNCACHEABLE; else m->md.pv_flags &= ~PV_MEMATTR_UNCACHEABLE; } Index: head/sys/mips/mips/uma_machdep.c =================================================================== --- head/sys/mips/mips/uma_machdep.c (revision 292468) +++ head/sys/mips/mips/uma_machdep.c (revision 292469) @@ -1,83 +1,88 @@ /*- * Copyright (c) 2003 Alan L. Cox * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include void * uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait) { vm_paddr_t pa; vm_page_t m; int pflags; void *va; *flags = UMA_SLAB_PRIV; pflags = malloc2vm_flags(wait) | VM_ALLOC_WIRED; for (;;) { m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, pflags); +#ifndef __mips_n64 + if (m == NULL && vm_page_reclaim_contig(pflags, 1, + 0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0)) + continue; +#endif if (m == NULL) { if (wait & M_NOWAIT) return (NULL); else - pmap_grow_direct_page_cache(); + VM_WAIT; } else break; } pa = VM_PAGE_TO_PHYS(m); va = (void *)MIPS_PHYS_TO_DIRECT(pa); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero(va, PAGE_SIZE); return (va); } void uma_small_free(void *mem, vm_size_t size, u_int8_t flags) { vm_page_t m; vm_paddr_t pa; pa = MIPS_DIRECT_TO_PHYS((vm_offset_t)mem); m = PHYS_TO_VM_PAGE(pa); m->wire_count--; vm_page_free(m); 
atomic_subtract_int(&vm_cnt.v_wire_count, 1); } Index: head/sys/vm/vm_kern.c =================================================================== --- head/sys/vm/vm_kern.c (revision 292468) +++ head/sys/vm/vm_kern.c (revision 292469) @@ -1,552 +1,559 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)vm_kern.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Kernel memory management. */ #include __FBSDID("$FreeBSD$"); #include #include #include /* for ticks and hz */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include vm_map_t kernel_map; vm_map_t exec_map; vm_map_t pipe_map; const void *zero_region; CTASSERT((ZERO_REGION_SIZE & PAGE_MASK) == 0); /* NB: Used by kernel debuggers. 
*/ const u_long vm_maxuser_address = VM_MAXUSER_ADDRESS; SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD, SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, "Min kernel address"); SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD, #if defined(__arm__) || defined(__sparc64__) &vm_max_kernel_address, 0, #else SYSCTL_NULL_ULONG_PTR, VM_MAX_KERNEL_ADDRESS, #endif "Max kernel address"); /* * kva_alloc: * * Allocate a virtual address range with no underlying object and * no initial mapping to physical memory. Any mapping from this * range to physical memory must be explicitly created prior to * its use, typically with pmap_qenter(). Any attempt to create * a mapping on demand through vm_fault() will result in a panic. */ vm_offset_t kva_alloc(size) vm_size_t size; { vm_offset_t addr; size = round_page(size); if (vmem_alloc(kernel_arena, size, M_BESTFIT | M_NOWAIT, &addr)) return (0); return (addr); } /* * kva_free: * * Release a region of kernel virtual memory allocated * with kva_alloc, and return the physical pages * associated with that region. * * This routine may not block on kernel maps. */ void kva_free(addr, size) vm_offset_t addr; vm_size_t size; { size = round_page(size); vmem_free(kernel_arena, addr, size); } /* * Allocates a region from the kernel address map and physical pages * within the specified address range to the kernel object. Creates a * wired mapping from this region to these pages, and returns the * region's starting virtual address. The allocated pages are not * necessarily physically contiguous. If M_ZERO is specified through the * given flags, then the pages are zeroed before they are mapped. */ vm_offset_t kmem_alloc_attr(vmem_t *vmem, vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr) { vm_object_t object = vmem == kmem_arena ? 
kmem_object : kernel_object; vm_offset_t addr, i; vm_ooffset_t offset; vm_page_t m; int pflags, tries; size = round_page(size); if (vmem_alloc(vmem, size, M_BESTFIT | flags, &addr)) return (0); offset = addr - VM_MIN_KERNEL_ADDRESS; pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED; VM_OBJECT_WLOCK(object); for (i = 0; i < size; i += PAGE_SIZE) { tries = 0; retry: m = vm_page_alloc_contig(object, OFF_TO_IDX(offset + i), pflags, 1, low, high, PAGE_SIZE, 0, memattr); if (m == NULL) { VM_OBJECT_WUNLOCK(object); if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) { - vm_pageout_grow_cache(tries, low, high); + if (!vm_page_reclaim_contig(pflags, 1, + low, high, PAGE_SIZE, 0) && + (flags & M_WAITOK) != 0) + VM_WAIT; VM_OBJECT_WLOCK(object); tries++; goto retry; } kmem_unback(object, addr, i); vmem_free(vmem, addr, size); return (0); } if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); m->valid = VM_PAGE_BITS_ALL; pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL, VM_PROT_ALL | PMAP_ENTER_WIRED, 0); } VM_OBJECT_WUNLOCK(object); return (addr); } /* * Allocates a region from the kernel address map and physically * contiguous pages within the specified address range to the kernel * object. Creates a wired mapping from this region to these pages, and * returns the region's starting virtual address. If M_ZERO is specified * through the given flags, then the pages are zeroed before they are * mapped. */ vm_offset_t kmem_alloc_contig(struct vmem *vmem, vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { vm_object_t object = vmem == kmem_arena ? 
kmem_object : kernel_object; vm_offset_t addr, tmp; vm_ooffset_t offset; vm_page_t end_m, m; + u_long npages; int pflags, tries; size = round_page(size); if (vmem_alloc(vmem, size, flags | M_BESTFIT, &addr)) return (0); offset = addr - VM_MIN_KERNEL_ADDRESS; pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED; + npages = atop(size); VM_OBJECT_WLOCK(object); tries = 0; retry: m = vm_page_alloc_contig(object, OFF_TO_IDX(offset), pflags, - atop(size), low, high, alignment, boundary, memattr); + npages, low, high, alignment, boundary, memattr); if (m == NULL) { VM_OBJECT_WUNLOCK(object); if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) { - vm_pageout_grow_cache(tries, low, high); + if (!vm_page_reclaim_contig(pflags, npages, low, high, + alignment, boundary) && (flags & M_WAITOK) != 0) + VM_WAIT; VM_OBJECT_WLOCK(object); tries++; goto retry; } vmem_free(vmem, addr, size); return (0); } - end_m = m + atop(size); + end_m = m + npages; tmp = addr; for (; m < end_m; m++) { if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); m->valid = VM_PAGE_BITS_ALL; pmap_enter(kernel_pmap, tmp, m, VM_PROT_ALL, VM_PROT_ALL | PMAP_ENTER_WIRED, 0); tmp += PAGE_SIZE; } VM_OBJECT_WUNLOCK(object); return (addr); } /* * kmem_suballoc: * * Allocates a map to manage a subrange * of the kernel virtual address space. * * Arguments are as follows: * * parent Map to take range from * min, max Returned endpoints of map * size Size of range to find * superpage_align Request that min is superpage aligned */ vm_map_t kmem_suballoc(vm_map_t parent, vm_offset_t *min, vm_offset_t *max, vm_size_t size, boolean_t superpage_align) { int ret; vm_map_t result; size = round_page(size); *min = vm_map_min(parent); ret = vm_map_find(parent, NULL, 0, min, size, 0, superpage_align ? 
VMFS_SUPER_SPACE : VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL, MAP_ACC_NO_CHARGE); if (ret != KERN_SUCCESS) panic("kmem_suballoc: bad status return of %d", ret); *max = *min + size; result = vm_map_create(vm_map_pmap(parent), *min, *max); if (result == NULL) panic("kmem_suballoc: cannot create submap"); if (vm_map_submap(parent, *min, *max, result) != KERN_SUCCESS) panic("kmem_suballoc: unable to change range to submap"); return (result); } /* * kmem_malloc: * * Allocate wired-down pages in the kernel's address space. */ vm_offset_t kmem_malloc(struct vmem *vmem, vm_size_t size, int flags) { vm_offset_t addr; int rv; size = round_page(size); if (vmem_alloc(vmem, size, flags | M_BESTFIT, &addr)) return (0); rv = kmem_back((vmem == kmem_arena) ? kmem_object : kernel_object, addr, size, flags); if (rv != KERN_SUCCESS) { vmem_free(vmem, addr, size); return (0); } return (addr); } /* * kmem_back: * * Allocate physical pages for the specified virtual address range. */ int kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags) { vm_offset_t offset, i; vm_page_t m; int pflags; KASSERT(object == kmem_object || object == kernel_object, ("kmem_back: only supports kernel objects.")); offset = addr - VM_MIN_KERNEL_ADDRESS; pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED; VM_OBJECT_WLOCK(object); for (i = 0; i < size; i += PAGE_SIZE) { retry: m = vm_page_alloc(object, OFF_TO_IDX(offset + i), pflags); /* * Ran out of space, free everything up and return. Don't need * to lock page queues here as we know that the pages we got * aren't on any queues. 
*/ if (m == NULL) { VM_OBJECT_WUNLOCK(object); if ((flags & M_NOWAIT) == 0) { VM_WAIT; VM_OBJECT_WLOCK(object); goto retry; } kmem_unback(object, addr, i); return (KERN_NO_SPACE); } if (flags & M_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("kmem_malloc: page %p is managed", m)); m->valid = VM_PAGE_BITS_ALL; pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL, VM_PROT_ALL | PMAP_ENTER_WIRED, 0); } VM_OBJECT_WUNLOCK(object); return (KERN_SUCCESS); } /* * kmem_unback: * * Unmap and free the physical pages underlying the specified virtual * address range. * * A physical page must exist within the specified object at each index * that is being unmapped. */ void kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size) { vm_page_t m; vm_offset_t i, offset; KASSERT(object == kmem_object || object == kernel_object, ("kmem_unback: only supports kernel objects.")); pmap_remove(kernel_pmap, addr, addr + size); offset = addr - VM_MIN_KERNEL_ADDRESS; VM_OBJECT_WLOCK(object); for (i = 0; i < size; i += PAGE_SIZE) { m = vm_page_lookup(object, OFF_TO_IDX(offset + i)); vm_page_unwire(m, PQ_NONE); vm_page_free(m); } VM_OBJECT_WUNLOCK(object); } /* * kmem_free: * * Free memory allocated with kmem_malloc. The size must match the * original allocation. */ void kmem_free(struct vmem *vmem, vm_offset_t addr, vm_size_t size) { size = round_page(size); kmem_unback((vmem == kmem_arena) ? kmem_object : kernel_object, addr, size); vmem_free(vmem, addr, size); } /* * kmap_alloc_wait: * * Allocates pageable memory from a sub-map of the kernel. If the submap * has no room, the caller sleeps waiting for more memory in the submap. * * This routine may block. */ vm_offset_t kmap_alloc_wait(map, size) vm_map_t map; vm_size_t size; { vm_offset_t addr; size = round_page(size); if (!swap_reserve(size)) return (0); for (;;) { /* * To make this work for more than one map, use the map's lock * to lock out sleepers/wakers. 
*/ vm_map_lock(map); if (vm_map_findspace(map, vm_map_min(map), size, &addr) == 0) break; /* no space now; see if we can ever get space */ if (vm_map_max(map) - vm_map_min(map) < size) { vm_map_unlock(map); swap_release(size); return (0); } map->needs_wakeup = TRUE; vm_map_unlock_and_wait(map, 0); } vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, MAP_ACC_CHARGED); vm_map_unlock(map); return (addr); } /* * kmap_free_wakeup: * * Returns memory to a submap of the kernel, and wakes up any processes * waiting for memory in that map. */ void kmap_free_wakeup(map, addr, size) vm_map_t map; vm_offset_t addr; vm_size_t size; { vm_map_lock(map); (void) vm_map_delete(map, trunc_page(addr), round_page(addr + size)); if (map->needs_wakeup) { map->needs_wakeup = FALSE; vm_map_wakeup(map); } vm_map_unlock(map); } void kmem_init_zero_region(void) { vm_offset_t addr, i; vm_page_t m; /* * Map a single physical page of zeros to a larger virtual range. * This requires less looping in places that want large amounts of * zeros, while not using much more physical resources. */ addr = kva_alloc(ZERO_REGION_SIZE); m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); for (i = 0; i < ZERO_REGION_SIZE; i += PAGE_SIZE) pmap_qenter(addr + i, &m, 1); pmap_protect(kernel_pmap, addr, addr + ZERO_REGION_SIZE, VM_PROT_READ); zero_region = (const void *)addr; } /* * kmem_init: * * Create the kernel map; insert a mapping covering kernel text, * data, bss, and all space allocated thus far (`boostrap' data). The * new map will thus map the range between VM_MIN_KERNEL_ADDRESS and * `start' as allocated, and the range between `start' and `end' as free. */ void kmem_init(start, end) vm_offset_t start, end; { vm_map_t m; m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end); m->system_map = 1; vm_map_lock(m); /* N.B.: cannot use kgdb to debug, starting with this assignment ... 
*/ kernel_map = m; (void) vm_map_insert(m, NULL, (vm_ooffset_t) 0, #ifdef __amd64__ KERNBASE, #else VM_MIN_KERNEL_ADDRESS, #endif start, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); /* ... and ending with the completion of the above `insert' */ vm_map_unlock(m); } #ifdef DIAGNOSTIC /* * Allow userspace to directly trigger the VM drain routine for testing * purposes. */ static int debug_vm_lowmem(SYSCTL_HANDLER_ARGS) { int error, i; i = 0; error = sysctl_handle_int(oidp, &i, 0, req); if (error) return (error); if (i) EVENTHANDLER_INVOKE(vm_lowmem, 0); return (0); } SYSCTL_PROC(_debug, OID_AUTO, vm_lowmem, CTLTYPE_INT | CTLFLAG_RW, 0, 0, debug_vm_lowmem, "I", "set to trigger vm_lowmem event"); #endif Index: head/sys/vm/vm_page.c =================================================================== --- head/sys/vm/vm_page.c (revision 292468) +++ head/sys/vm/vm_page.c (revision 292469) @@ -1,3343 +1,3932 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /*- * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
*/ /* * GENERAL RULES ON VM_PAGE MANIPULATION * * - A page queue lock is required when adding or removing a page from a * page queue regardless of other locks or the busy state of a page. * * * In general, no thread besides the page daemon can acquire or * hold more than one page queue lock at a time. * * * The page daemon can acquire and hold any pair of page queue * locks in any order. * * - The object lock is required when inserting or removing * pages from an object (vm_page_insert() or vm_page_remove()). * */ /* * Resident memory management module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Associated with page of user-allocatable memory is a * page structure. */ struct vm_domain vm_dom[MAXMEMDOM]; struct mtx_padalign vm_page_queue_free_mtx; struct mtx_padalign pa_lock[PA_LOCK_COUNT]; vm_page_t vm_page_array; long vm_page_array_size; long first_page; int vm_page_zero_count; static int boot_pages = UMA_BOOT_PAGES; SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &boot_pages, 0, "number of pages allocated for bootstrapping the VM system"); static int pa_tryrelock_restart; SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD, &pa_tryrelock_restart, 0, "Number of tryrelock restarts"); static TAILQ_HEAD(, vm_page) blacklist_head; static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages"); /* Is the page daemon waiting for free pages? 
*/
static int vm_pageout_pages_needed;

/* UMA zone from which fictitious page structures are allocated. */
static uma_zone_t fakepg_zone;

/*
 * Prototypes for file-local (static) helpers.
 *
 * NOTE(review): lines starting with '+' are additions carried by the
 * enclosing diff; the marker is not part of the C source.
 */
static struct vnode *vm_page_alloc_init(vm_page_t m);
static void vm_page_cache_turn_free(vm_page_t m);
static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
static void vm_page_enqueue(uint8_t queue, vm_page_t m);
+static void vm_page_free_wakeup(void);
static void vm_page_init_fakepg(void *dummy);
static int vm_page_insert_after(vm_page_t m, vm_object_t object,
    vm_pindex_t pindex, vm_page_t mpred);
static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
    vm_page_t mpred);
+static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
+    vm_paddr_t high);

/* Register vm_page_init_fakepg() to run at SI_SUB_VM / SI_ORDER_SECOND. */
SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);

/*
 * SYSINIT hook: create the "fakepg" UMA zone, whose items are sized as
 * struct vm_page, and store it in fakepg_zone.  "dummy" is not used.
 */
static void
vm_page_init_fakepg(void *dummy)
{

	fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
}

/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
#if PAGE_SIZE == 32768
#ifdef CTASSERT
CTASSERT(sizeof(u_long) >= 8);
#endif
#endif

/*
 * Try to acquire a physical address lock while a pmap is locked.  If we
 * fail to trylock we unlock and lock the pmap directly and cache the
 * locked pa in *locked.  The caller should then restart their loop in case
 * the virtual to physical mapping has changed.
 *
 * On entry *locked is either 0 or a physical address whose PA lock the
 * caller owns; on return *locked is always "pa".
 *
 * Returns 0 when the lock covering "pa" is held without the pmap lock
 * having been dropped, and EAGAIN when the pmap lock was released and
 * reacquired in order to take the PA lock.
 */
int
vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked)
{
	vm_paddr_t lockpa;

	lockpa = *locked;
	*locked = pa;
	if (lockpa) {
		PA_LOCK_ASSERT(lockpa, MA_OWNED);
		/* Both addresses share one lock: nothing to do. */
		if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa))
			return (0);
		PA_UNLOCK(lockpa);
	}
	if (PA_TRYLOCK(pa))
		return (0);
	/*
	 * Slow path: release the pmap lock before blocking on the PA lock,
	 * then retake the pmap lock.  Each restart is counted in
	 * pa_tryrelock_restart.
	 */
	PMAP_UNLOCK(pmap);
	atomic_add_int(&pa_tryrelock_restart, 1);
	PA_LOCK(pa);
	PMAP_LOCK(pmap);
	return (EAGAIN);
}

/*
 *	vm_set_page_size:
 *
 *	Sets the page size, perhaps based upon the memory
 *	size.  Must be called before any use of page-size
 *	dependent functions.
*/ void vm_set_page_size(void) { if (vm_cnt.v_page_size == 0) vm_cnt.v_page_size = PAGE_SIZE; if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0) panic("vm_set_page_size: page size not a power of two"); } /* * vm_page_blacklist_next: * * Find the next entry in the provided string of blacklist * addresses. Entries are separated by space, comma, or newline. * If an invalid integer is encountered then the rest of the * string is skipped. Updates the list pointer to the next * character, or NULL if the string is exhausted or invalid. */ static vm_paddr_t vm_page_blacklist_next(char **list, char *end) { vm_paddr_t bad; char *cp, *pos; if (list == NULL || *list == NULL) return (0); if (**list =='\0') { *list = NULL; return (0); } /* * If there's no end pointer then the buffer is coming from * the kenv and we know it's null-terminated. */ if (end == NULL) end = *list + strlen(*list); /* Ensure that strtoq() won't walk off the end */ if (*end != '\0') { if (*end == '\n' || *end == ' ' || *end == ',') *end = '\0'; else { printf("Blacklist not terminated, skipping\n"); *list = NULL; return (0); } } for (pos = *list; *pos != '\0'; pos = cp) { bad = strtoq(pos, &cp, 0); if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') { if (bad == 0) { if (++cp < end) continue; else break; } } else break; if (*cp == '\0' || ++cp >= end) *list = NULL; else *list = cp; return (trunc_page(bad)); } printf("Garbage in RAM blacklist, skipping\n"); *list = NULL; return (0); } /* * vm_page_blacklist_check: * * Iterate through the provided string of blacklist addresses, pulling * each entry out of the physical allocator free list and putting it * onto a list for reporting via the vm.page_blacklist sysctl. 
*/ static void vm_page_blacklist_check(char *list, char *end) { vm_paddr_t pa; vm_page_t m; char *next; int ret; next = list; while (next != NULL) { if ((pa = vm_page_blacklist_next(&next, end)) == 0) continue; m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) continue; mtx_lock(&vm_page_queue_free_mtx); ret = vm_phys_unfree_page(m); mtx_unlock(&vm_page_queue_free_mtx); if (ret == TRUE) { TAILQ_INSERT_TAIL(&blacklist_head, m, listq); if (bootverbose) printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa); } } } /* * vm_page_blacklist_load: * * Search for a special module named "ram_blacklist". It'll be a * plain text file provided by the user via the loader directive * of the same name. */ static void vm_page_blacklist_load(char **list, char **end) { void *mod; u_char *ptr; u_int len; mod = NULL; ptr = NULL; mod = preload_search_by_type("ram_blacklist"); if (mod != NULL) { ptr = preload_fetch_addr(mod); len = preload_fetch_size(mod); } *list = ptr; if (ptr != NULL) *end = ptr + len; else *end = NULL; return; } static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS) { vm_page_t m; struct sbuf sbuf; int error, first; first = 1; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); TAILQ_FOREACH(m, &blacklist_head, listq) { sbuf_printf(&sbuf, "%s%#jx", first ? 
"" : ",", (uintmax_t)m->phys_addr); first = 0; } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } static void vm_page_domain_init(struct vm_domain *vmd) { struct vm_pagequeue *pq; int i; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = "vm inactive pagequeue"; *__DECONST(int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) = &vm_cnt.v_inactive_count; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = "vm active pagequeue"; *__DECONST(int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) = &vm_cnt.v_active_count; vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; vmd->vmd_segs = 0; vmd->vmd_oom = FALSE; vmd->vmd_pass = 0; for (i = 0; i < PQ_COUNT; i++) { pq = &vmd->vmd_pagequeues[i]; TAILQ_INIT(&pq->pq_pl); mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue", MTX_DEF | MTX_DUPOK); } } /* * vm_page_startup: * * Initializes the resident memory module. * * Allocates memory for the page cells, and * for the object/offset-to-page hash table headers. * Each page cell is initialized and placed on the free list. 
*/
/*
 * "vaddr" is rounded up to a page boundary, handed by reference to each
 * pmap_map() call made here, and bumped by one guard page; the resulting
 * value is returned.  All boot-time storage is carved from the top of the
 * largest phys_avail[] range, which is shrunk accordingly.
 */
vm_offset_t
vm_page_startup(vm_offset_t vaddr)
{
	vm_offset_t mapped;
	vm_paddr_t page_range;
	vm_paddr_t new_end;
	int i;
	vm_paddr_t pa;
	vm_paddr_t last_pa;
	char *list, *listend;
	vm_paddr_t end;
	vm_paddr_t biggestsize;
	vm_paddr_t low_water, high_water;
	int biggestone;

	biggestsize = 0;
	biggestone = 0;
	vaddr = round_page(vaddr);

	/* Page-align every phys_avail[] range: start up, end down. */
	for (i = 0; phys_avail[i + 1]; i += 2) {
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
	}

	/*
	 * Compute the lowest and highest physical addresses over both
	 * vm_phys_segs[] and phys_avail[], and remember the index of the
	 * largest phys_avail[] range.
	 */
	low_water = phys_avail[0];
	high_water = phys_avail[1];

	for (i = 0; i < vm_phys_nsegs; i++) {
		if (vm_phys_segs[i].start < low_water)
			low_water = vm_phys_segs[i].start;
		if (vm_phys_segs[i].end > high_water)
			high_water = vm_phys_segs[i].end;
	}
	for (i = 0; phys_avail[i + 1]; i += 2) {
		vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];

		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
		if (phys_avail[i] < low_water)
			low_water = phys_avail[i];
		if (phys_avail[i + 1] > high_water)
			high_water = phys_avail[i + 1];
	}

	/* Allocations below grow downward from the end of the largest range. */
	end = phys_avail[biggestone+1];

	/*
	 * Initialize the page and queue locks.
	 */
	mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF);
	for (i = 0; i < PA_LOCK_COUNT; i++)
		mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
	for (i = 0; i < vm_ndomains; i++)
		vm_page_domain_init(&vm_dom[i]);

	/*
	 * Allocate memory for use when bootstrapping the kernel memory
	 * allocator.
	 *
	 * CTLFLAG_RDTUN doesn't work during the early boot process, so we must
	 * manually fetch the value.
	 */
	TUNABLE_INT_FETCH("vm.boot_pages", &boot_pages);
	new_end = end - (boot_pages * UMA_SLAB_SIZE);
	new_end = trunc_page(new_end);
	mapped = pmap_map(&vaddr, new_end, end,
	    VM_PROT_READ | VM_PROT_WRITE);
	bzero((void *)mapped, end - new_end);
	uma_startup((void *)mapped, boot_pages);

#if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \
    defined(__i386__) || defined(__mips__)
	/*
	 * Allocate a bitmap to indicate that a random physical page
	 * needs to be included in a minidump.
	 *
	 * The amd64 port needs this to indicate which direct map pages
	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
	 *
	 * However, i386 still needs this workspace internally within the
	 * minidump code.  In theory, they are not needed on i386, but are
	 * included should the sf_buf code decide to use them.
	 */
	last_pa = 0;
	for (i = 0; dump_avail[i + 1] != 0; i += 2)
		if (dump_avail[i + 1] > last_pa)
			last_pa = dump_avail[i + 1];
	page_range = last_pa / PAGE_SIZE;
	/* One bit per page, rounded up to whole bytes and then whole pages. */
	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
	new_end -= vm_page_dump_size;
	vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
	    new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
	bzero((void *)vm_page_dump, vm_page_dump_size);
#endif
#ifdef __amd64__
	/*
	 * Request that the physical pages underlying the message buffer be
	 * included in a crash dump.  Since the message buffer is accessed
	 * through the direct map, they are not automatically included.
	 */
	pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
	last_pa = pa + round_page(msgbufsize);
	while (pa < last_pa) {
		dump_add_page(pa);
		pa += PAGE_SIZE;
	}
#endif
	/*
	 * Compute the number of pages of memory that will be available for
	 * use (taking into account the overhead of a page structure per
	 * page).
	 */
	first_page = low_water / PAGE_SIZE;
#ifdef VM_PHYSSEG_SPARSE
	page_range = 0;
	for (i = 0; i < vm_phys_nsegs; i++) {
		page_range += atop(vm_phys_segs[i].end -
		    vm_phys_segs[i].start);
	}
	for (i = 0; phys_avail[i + 1] != 0; i += 2)
		page_range += atop(phys_avail[i + 1] - phys_avail[i]);
#elif defined(VM_PHYSSEG_DENSE)
	page_range = high_water / PAGE_SIZE - first_page;
#else
#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
#endif
	end = new_end;

	/*
	 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
	 */
	vaddr += PAGE_SIZE;

	/*
	 * Initialize the mem entry structures now, and put them in the free
	 * queue.
	 */
	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
	mapped = pmap_map(&vaddr, new_end, end,
	    VM_PROT_READ | VM_PROT_WRITE);
	vm_page_array = (vm_page_t) mapped;
#if VM_NRESERVLEVEL > 0
	/*
	 * Allocate memory for the reservation management system's data
	 * structures.
	 */
	new_end = vm_reserv_startup(&vaddr, new_end, high_water);
#endif
#if defined(__aarch64__) || defined(__amd64__) || defined(__mips__)
	/*
	 * pmap_map on arm64, amd64, and mips can come out of the direct-map,
	 * not kvm like i386, so the pages must be tracked for a crashdump to
	 * include this data.  This includes the vm_page_array and the early
	 * UMA bootstrap pages.
	 */
	for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
		dump_add_page(pa);
#endif
	/* Everything at or above new_end is now reserved for the above. */
	phys_avail[biggestone + 1] = new_end;

	/*
	 * Add physical memory segments corresponding to the available
	 * physical pages.
	 */
	for (i = 0; phys_avail[i + 1] != 0; i += 2)
		vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]);

	/*
	 * Clear all of the page structures
	 */
	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
	for (i = 0; i < page_range; i++)
		vm_page_array[i].order = VM_NFREEORDER;
	vm_page_array_size = page_range;

	/*
	 * Initialize the physical memory allocator.
	 */
	vm_phys_init();

	/*
	 * Add every available physical page that is not blacklisted to
	 * the free lists.
	 */
	vm_cnt.v_page_count = 0;
	vm_cnt.v_free_count = 0;
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		pa = phys_avail[i];
		last_pa = phys_avail[i + 1];
		while (pa < last_pa) {
			vm_phys_add_page(pa);
			pa += PAGE_SIZE;
		}
	}

	/*
	 * Pull blacklisted pages back out of the free lists.  Two sources
	 * are consulted: the preloaded "ram_blacklist" module and the
	 * "vm.blacklist" kernel environment variable.
	 */
	TAILQ_INIT(&blacklist_head);
	vm_page_blacklist_load(&list, &listend);
	vm_page_blacklist_check(list, listend);

	list = kern_getenv("vm.blacklist");
	vm_page_blacklist_check(list, NULL);

	freeenv(list);
#if VM_NRESERVLEVEL > 0
	/*
	 * Initialize the reservation management system.
	 */
	vm_reserv_init();
#endif
	return (vaddr);
}

/* Mark the page as referenced by setting its PGA_REFERENCED aflag. */
void
vm_page_reference(vm_page_t m)
{

	vm_page_aflag_set(m, PGA_REFERENCED);
}

/*
 *	vm_page_busy_downgrade:
 *
 *	Downgrade an exclusive busy page into a single shared busy page.
 *
 *	The waiters bit is carried over unchanged; the compare-and-set is
 *	retried until it succeeds.
 */
void
vm_page_busy_downgrade(vm_page_t m)
{
	u_int x;

	vm_page_assert_xbusied(m);

	for (;;) {
		x = m->busy_lock;
		/* Keep only the waiters bit from the observed value. */
		x &= VPB_BIT_WAITERS;
		if (atomic_cmpset_rel_int(&m->busy_lock,
		    VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1) | x))
			break;
	}
}

/*
 *	vm_page_sbusied:
 *
 *	Return a positive value if the page is shared busied, 0 otherwise.
 *
 *	The busy word is sampled once, without taking any lock here.
 */
int
vm_page_sbusied(vm_page_t m)
{
	u_int x;

	x = m->busy_lock;
	return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED);
}

/*
 *	vm_page_sunbusy:
 *
 *	Shared unbusy a page.
 *
 *	Three cases, each retried if the busy word changes underneath us:
 *	more than one sharer (drop one), last sharer without waiters
 *	(switch to unbusied), and last sharer with waiters (switch to
 *	unbusied under the page lock and wake the waiters).
 */
void
vm_page_sunbusy(vm_page_t m)
{
	u_int x;

	vm_page_assert_sbusied(m);

	for (;;) {
		x = m->busy_lock;
		if (VPB_SHARERS(x) > 1) {
			if (atomic_cmpset_int(&m->busy_lock, x,
			    x - VPB_ONE_SHARER))
				break;
			continue;
		}
		if ((x & VPB_BIT_WAITERS) == 0) {
			KASSERT(x == VPB_SHARERS_WORD(1),
			    ("vm_page_sunbusy: invalid lock state"));
			if (atomic_cmpset_int(&m->busy_lock,
			    VPB_SHARERS_WORD(1), VPB_UNBUSIED))
				break;
			continue;
		}
		KASSERT(x == (VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS),
		    ("vm_page_sunbusy: invalid lock state for waiters"));

		/* The wakeup is issued with the page lock held. */
		vm_page_lock(m);
		if (!atomic_cmpset_int(&m->busy_lock, x, VPB_UNBUSIED)) {
			vm_page_unlock(m);
			continue;
		}
		wakeup(m);
		vm_page_unlock(m);
		break;
	}
}

/*
 *	vm_page_busy_sleep:
 *
 *	Sleep and release the page lock, using the page pointer as wchan.
 *	This is used to implement the hard-path of busying mechanism.
 *
 *	The given page must be locked.
 *
 *	Returns without sleeping, after unlocking the page, when the page
 *	is already unbusied or when the waiters bit could not be set
 *	because the busy word changed.
 */
void
vm_page_busy_sleep(vm_page_t m, const char *wmesg)
{
	u_int x;

	vm_page_lock_assert(m, MA_OWNED);

	x = m->busy_lock;
	if (x == VPB_UNBUSIED) {
		vm_page_unlock(m);
		return;
	}
	if ((x & VPB_BIT_WAITERS) == 0 &&
	    !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS)) {
		vm_page_unlock(m);
		return;
	}
	msleep(m, vm_page_lockptr(m), PVM | PDROP, wmesg, 0);
}

/*
 *	vm_page_trysbusy:
 *
 *	Try to shared busy a page.
*	If the operation succeeds 1 is returned otherwise 0.
 *	The operation never sleeps.
 */
int
vm_page_trysbusy(vm_page_t m)
{
	u_int x;

	for (;;) {
		x = m->busy_lock;
		/* Shared bit clear: sharing is not possible right now. */
		if ((x & VPB_BIT_SHARED) == 0)
			return (0);
		/* Add one sharer; retry if the busy word changed meanwhile. */
		if (atomic_cmpset_acq_int(&m->busy_lock, x, x + VPB_ONE_SHARER))
			return (1);
	}
}

/*
 *	vm_page_xunbusy_hard:
 *
 *	Called after the first try the exclusive unbusy of a page failed.
 *	It is assumed that the waiters bit is on.
 *
 *	Stores VPB_UNBUSIED and wakes sleepers on the page, both while
 *	holding the page lock.
 */
void
vm_page_xunbusy_hard(vm_page_t m)
{

	vm_page_assert_xbusied(m);

	vm_page_lock(m);
	atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
	wakeup(m);
	vm_page_unlock(m);
}

/*
 *	vm_page_flash:
 *
 *	Wakeup anyone waiting for the page.
 *	The ownership bits do not change.
 *
 *	The given page must be locked.
 *
 *	Returns immediately when the waiters bit is not set; otherwise the
 *	bit is cleared with a compare-and-set loop before the wakeup.
 */
void
vm_page_flash(vm_page_t m)
{
	u_int x;

	vm_page_lock_assert(m, MA_OWNED);

	for (;;) {
		x = m->busy_lock;
		if ((x & VPB_BIT_WAITERS) == 0)
			return;
		if (atomic_cmpset_int(&m->busy_lock, x,
		    x & (~VPB_BIT_WAITERS)))
			break;
	}
	wakeup(m);
}

/*
 * Keep page from being freed by the page daemon
 * much of the same effect as wiring, except much lower
 * overhead and should be used only for *very* temporary
 * holding ("wiring").
 *
 * The page lock must be held; the hold count is simply incremented.
 */
void
vm_page_hold(vm_page_t mem)
{

	vm_page_lock_assert(mem, MA_OWNED);
        mem->hold_count++;
}

/*
 * Drop one hold on the page.  The page lock must be held.  When the count
 * reaches zero and PG_UNHOLDFREE is set, the page is passed to
 * vm_page_free_toq().
 */
void
vm_page_unhold(vm_page_t mem)
{

	vm_page_lock_assert(mem, MA_OWNED);
	KASSERT(mem->hold_count >= 1, ("vm_page_unhold: hold count < 0!!!"));
	--mem->hold_count;
	if (mem->hold_count == 0 && (mem->flags & PG_UNHOLDFREE) != 0)
		vm_page_free_toq(mem);
}

/*
 *	vm_page_unhold_pages:
 *
 *	Unhold each of the pages that is referenced by the given array.
 *
 *	"count" is the number of entries in "ma".  Page locks are taken
 *	here; a lock is kept across consecutive pages that share it.
 */
void
vm_page_unhold_pages(vm_page_t *ma, int count)
{
	struct mtx *mtx, *new_mtx;

	mtx = NULL;
	for (; count != 0; count--) {
		/*
		 * Avoid releasing and reacquiring the same page lock.
		 */
		new_mtx = vm_page_lockptr(*ma);
		if (mtx != new_mtx) {
			if (mtx != NULL)
				mtx_unlock(mtx);
			mtx = new_mtx;
			mtx_lock(mtx);
		}
		vm_page_unhold(*ma);
		ma++;
	}
	if (mtx != NULL)
		mtx_unlock(mtx);
}

/*
 * Translate a physical address into its vm_page.  Addresses that the
 * regular lookup does not cover are passed to
 * vm_phys_fictitious_to_vm_page(), whose result is returned as-is.
 */
vm_page_t
PHYS_TO_VM_PAGE(vm_paddr_t pa)
{
	vm_page_t m;

#ifdef VM_PHYSSEG_SPARSE
	m = vm_phys_paddr_to_vm_page(pa);
	if (m == NULL)
		m = vm_phys_fictitious_to_vm_page(pa);
	return (m);
#elif defined(VM_PHYSSEG_DENSE)
	long pi;

	/* Dense layout: index vm_page_array by page number past first_page. */
	pi = atop(pa);
	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
		m = &vm_page_array[pi - first_page];
		return (m);
	}
	return (vm_phys_fictitious_to_vm_page(pa));
#else
#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
#endif
}

/*
 *	vm_page_getfake:
 *
 *	Create a fictitious page with the specified physical address and
 *	memory attribute.  The memory attribute is the only machine-
 *	dependent aspect of a fictitious page that must be initialized.
 *
 *	The page structure is allocated zeroed from fakepg_zone with
 *	M_WAITOK and initialized by vm_page_initfake().
 */
vm_page_t
vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
{
	vm_page_t m;

	m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
	vm_page_initfake(m, paddr, memattr);
	return (m);
}

/*
 * Initialize "m" as a fictitious page for "paddr" with "memattr".  If the
 * page already has PG_FICTITIOUS set, only the memory attribute is
 * refreshed and every other field is left untouched.
 */
void
vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
{

	if ((m->flags & PG_FICTITIOUS) != 0) {
		/*
		 * The page's memattr might have changed since the
		 * previous initialization.  Update the pmap to the
		 * new memattr.
		 */
		goto memattr;
	}
	m->phys_addr = paddr;
	m->queue = PQ_NONE;
	/* Fictitious pages don't use "segind". */
	m->flags = PG_FICTITIOUS;
	/* Fictitious pages don't use "order" or "pool". */
	m->oflags = VPO_UNMANAGED;
	m->busy_lock = VPB_SINGLE_EXCLUSIVER;
	m->wire_count = 1;
	pmap_page_init(m);
memattr:
	pmap_page_set_memattr(m, memattr);
}

/*
 *	vm_page_putfake:
 *
 *	Release a fictitious page.
*/ void vm_page_putfake(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m)); KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_putfake: bad page %p", m)); uma_zfree(fakepg_zone, m); } /* * vm_page_updatefake: * * Update the given fictitious page to the specified physical address and * memory attribute. */ void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_updatefake: bad page %p", m)); m->phys_addr = paddr; pmap_page_set_memattr(m, memattr); } /* * vm_page_free: * * Free a page. */ void vm_page_free(vm_page_t m) { m->flags &= ~PG_ZERO; vm_page_free_toq(m); } /* * vm_page_free_zero: * * Free a page to the zerod-pages queue */ void vm_page_free_zero(vm_page_t m) { m->flags |= PG_ZERO; vm_page_free_toq(m); } /* * Unbusy and handle the page queueing for a page from the VOP_GETPAGES() * array which was optionally read ahead or behind. */ void vm_page_readahead_finish(vm_page_t m) { /* We shouldn't put invalid pages on queues. */ KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m)); /* * Since the page is not the actually needed one, whether it should * be activated or deactivated is not obvious. Empirical results * have shown that deactivating the page is usually the best choice, * unless the page is wanted by another thread. */ vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); vm_page_unlock(m); vm_page_xunbusy(m); } /* * vm_page_sleep_if_busy: * * Sleep and release the page queues lock if the page is busied. * Returns TRUE if the thread slept. * * The given page must be unlocked and object containing it must * be locked. 
*/ int vm_page_sleep_if_busy(vm_page_t m, const char *msg) { vm_object_t obj; vm_page_lock_assert(m, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(m->object); if (vm_page_busied(m)) { /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. */ obj = m->object; vm_page_lock(m); VM_OBJECT_WUNLOCK(obj); vm_page_busy_sleep(m, msg); VM_OBJECT_WLOCK(obj); return (TRUE); } return (FALSE); } /* * vm_page_dirty_KBI: [ internal use only ] * * Set all bits in the page's dirty field. * * The object containing the specified page must be locked if the * call is made from the machine-independent layer. * * See vm_page_clear_dirty_mask(). * * This function should only be called by vm_page_dirty(). */ void vm_page_dirty_KBI(vm_page_t m) { /* These assertions refer to this operation by its public name. */ KASSERT((m->flags & PG_CACHED) == 0, ("vm_page_dirty: page in cache!")); KASSERT(m->valid == VM_PAGE_BITS_ALL, ("vm_page_dirty: page is invalid!")); m->dirty = VM_PAGE_BITS_ALL; } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object and object list. * * The object must be locked. */ int vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { vm_page_t mpred; VM_OBJECT_ASSERT_WLOCKED(object); mpred = vm_radix_lookup_le(&object->rtree, pindex); return (vm_page_insert_after(m, object, pindex, mpred)); } /* * vm_page_insert_after: * * Inserts the page "m" into the specified object at offset "pindex". * * The page "mpred" must immediately precede the offset "pindex" within * the specified object. * * The object must be locked. 
*/
/*
 * Returns 0 on success.  Returns 1 when vm_radix_insert() fails, in which
 * case the page's previous object/pindex values are restored.
 */
static int
vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
    vm_page_t mpred)
{
	vm_pindex_t sidx;
	vm_object_t sobj;
	vm_page_t msucc;

	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT(m->object == NULL,
	    ("vm_page_insert_after: page already inserted"));
	/*
	 * Locate the would-be successor so the ordering of the object's page
	 * list around "pindex" can be asserted on both sides.
	 */
	if (mpred != NULL) {
		KASSERT(mpred->object == object,
		    ("vm_page_insert_after: object doesn't contain mpred"));
		KASSERT(mpred->pindex < pindex,
		    ("vm_page_insert_after: mpred doesn't precede pindex"));
		msucc = TAILQ_NEXT(mpred, listq);
	} else
		msucc = TAILQ_FIRST(&object->memq);
	if (msucc != NULL)
		KASSERT(msucc->pindex > pindex,
		    ("vm_page_insert_after: msucc doesn't succeed pindex"));

	/*
	 * Record the object/offset pair in this page
	 */
	sobj = m->object;
	sidx = m->pindex;
	m->object = object;
	m->pindex = pindex;

	/*
	 * Now link into the object's ordered list of backed pages.
	 */
	if (vm_radix_insert(&object->rtree, m)) {
		/* Insertion failed: undo the object/offset assignment. */
		m->object = sobj;
		m->pindex = sidx;
		return (1);
	}
	vm_page_insert_radixdone(m, object, mpred);
	return (0);
}

/*
 *	vm_page_insert_radixdone:
 *
 *	Complete page "m" insertion into the specified object after the
 *	radix trie hooking.
 *
 *	The page "mpred" must precede the offset "m->pindex" within the
 *	specified object.
 *
 *	The object must be locked.
 *
 *	NOTE(review): the two mpred assertion messages below name
 *	vm_page_insert_after rather than this function.
 */
static void
vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred)
{

	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT(object != NULL && m->object == object,
	    ("vm_page_insert_radixdone: page %p has inconsistent object", m));
	if (mpred != NULL) {
		KASSERT(mpred->object == object,
		    ("vm_page_insert_after: object doesn't contain mpred"));
		KASSERT(mpred->pindex < m->pindex,
		    ("vm_page_insert_after: mpred doesn't precede pindex"));
	}

	/* Link into the object's page list directly after "mpred", if any. */
	if (mpred != NULL)
		TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq);
	else
		TAILQ_INSERT_HEAD(&object->memq, m, listq);

	/*
	 * Show that the object has one more resident page.
	 */
	object->resident_page_count++;

	/*
	 * Hold the vnode until the last page is released.
	 */
	if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
		vhold(object->handle);

	/*
	 * Since we are inserting a new and possibly dirty page,
	 * update the object's OBJ_MIGHTBEDIRTY flag.
	 */
	if (pmap_page_is_write_mapped(m))
		vm_object_set_writeable_dirty(object);
}

/*
 *	vm_page_remove:
 *
 *	Removes the given mem entry from the object/offset-page
 *	table and the object page list, but do not invalidate/terminate
 *	the backing store.
 *
 *	The object must be locked.  The page must be locked if it is managed.
 *
 *	Does nothing when the page belongs to no object.  An exclusively
 *	busied page is unbusied, and its waiters woken, before removal.
 */
void
vm_page_remove(vm_page_t m)
{
	vm_object_t object;
	boolean_t lockacq;

	if ((m->oflags & VPO_UNMANAGED) == 0)
		vm_page_lock_assert(m, MA_OWNED);
	if ((object = m->object) == NULL)
		return;
	VM_OBJECT_ASSERT_WLOCKED(object);
	if (vm_page_xbusied(m)) {
		/*
		 * vm_page_flash() requires the page lock.  An unmanaged page
		 * may arrive unlocked, so take the lock here when the caller
		 * does not already own it.
		 */
		lockacq = FALSE;
		if ((m->oflags & VPO_UNMANAGED) != 0 &&
		    !mtx_owned(vm_page_lockptr(m))) {
			lockacq = TRUE;
			vm_page_lock(m);
		}
		vm_page_flash(m);
		atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
		if (lockacq)
			vm_page_unlock(m);
	}

	/*
	 * Now remove from the object's list of backed pages.
	 */
	vm_radix_remove(&object->rtree, m->pindex);
	TAILQ_REMOVE(&object->memq, m, listq);

	/*
	 * And show that the object has one fewer resident page.
	 */
	object->resident_page_count--;

	/*
	 * The vnode may now be recycled.
	 */
	if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
		vdrop(object->handle);

	m->object = NULL;
}

/*
 *	vm_page_lookup:
 *
 *	Returns the page associated with the object/offset
 *	pair specified; if none is found, NULL is returned.
 *
 *	The object must be locked.
 */
vm_page_t
vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
{

	VM_OBJECT_ASSERT_LOCKED(object);
	return (vm_radix_lookup(&object->rtree, pindex));
}

/*
 *	vm_page_find_least:
 *
 *	Returns the page associated with the object with least pindex
 *	greater than or equal to the parameter pindex, or NULL.
 *
 *	The object must be locked.
*/ vm_page_t vm_page_find_least(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; VM_OBJECT_ASSERT_LOCKED(object); if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex) m = vm_radix_lookup_ge(&object->rtree, pindex); return (m); } /* * Returns the given page's successor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_next(vm_page_t m) { vm_page_t next; VM_OBJECT_ASSERT_WLOCKED(m->object); if ((next = TAILQ_NEXT(m, listq)) != NULL && next->pindex != m->pindex + 1) next = NULL; return (next); } /* * Returns the given page's predecessor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_prev(vm_page_t m) { vm_page_t prev; VM_OBJECT_ASSERT_WLOCKED(m->object); if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL && prev->pindex != m->pindex - 1) prev = NULL; return (prev); } /* * Uses the page mnew as a replacement for an existing page at index * pindex which must be already present in the object. * * The existing page must not be on a paging queue. */ vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex) { vm_page_t mold; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(mnew->object == NULL, ("vm_page_replace: page already in object")); /* * This function mostly follows vm_page_insert() and * vm_page_remove() without the radix, object count and vnode * dance. Double check such functions for more comments. */ mnew->object = object; mnew->pindex = pindex; mold = vm_radix_replace(&object->rtree, mnew); KASSERT(mold->queue == PQ_NONE, ("vm_page_replace: mold is on a paging queue")); /* Keep the resident page list in sorted order. 
*/ TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq); TAILQ_REMOVE(&object->memq, mold, listq); mold->object = NULL; vm_page_xunbusy(mold); /* * The object's resident_page_count does not change because we have * swapped one page for another, but OBJ_MIGHTBEDIRTY. */ if (pmap_page_is_write_mapped(mnew)) vm_object_set_writeable_dirty(object); return (mold); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * Note: swap associated with the page must be invalidated by the move. We * have to do this for several reasons: (1) we aren't freeing the * page, (2) we are dirtying the page, (3) the VM system is probably * moving the page from object A to B, and will then later move * the backing store from A to B and we can't have a conflict. * * Note: we *always* dirty the page. It is necessary both for the * fact that we moved it, and because we may be invalidating * swap. If the page is on the cache, we have to deactivate it * or vm_page_dirty() will panic. Dirty pages are not allowed * on the cache. * * The objects must be locked. */ int vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) { vm_page_t mpred; vm_pindex_t opidx; VM_OBJECT_ASSERT_WLOCKED(new_object); mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex); KASSERT(mpred == NULL || mpred->pindex != new_pindex, ("vm_page_rename: pindex already renamed")); /* * Create a custom version of vm_page_insert() which does not depend * by m_prev and can cheat on the implementation aspects of the * function. */ opidx = m->pindex; m->pindex = new_pindex; if (vm_radix_insert(&new_object->rtree, m)) { m->pindex = opidx; return (1); } /* * The operation cannot fail anymore. The removal must happen before * the listq iterator is tainted. */ m->pindex = opidx; vm_page_lock(m); vm_page_remove(m); /* Return back to the new pindex to complete vm_page_insert(). 
*/ m->pindex = new_pindex; m->object = new_object; vm_page_unlock(m); vm_page_insert_radixdone(m, new_object, mpred); vm_page_dirty(m); return (0); } /* * Convert all of the given object's cached pages that have a * pindex within the given range into free pages. If the value * zero is given for "end", then the range's upper bound is * infinity. If the given object is backed by a vnode and it * transitions from having one or more cached pages to none, the * vnode's hold count is reduced. */ void vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { vm_page_t m; boolean_t empty; mtx_lock(&vm_page_queue_free_mtx); if (__predict_false(vm_radix_is_empty(&object->cache))) { mtx_unlock(&vm_page_queue_free_mtx); return; } while ((m = vm_radix_lookup_ge(&object->cache, start)) != NULL) { if (end != 0 && m->pindex >= end) break; vm_radix_remove(&object->cache, m->pindex); vm_page_cache_turn_free(m); } empty = vm_radix_is_empty(&object->cache); mtx_unlock(&vm_page_queue_free_mtx); if (object->type == OBJT_VNODE && empty) vdrop(object->handle); } /* * Returns the cached page that is associated with the given * object and offset. If, however, none exists, returns NULL. * * The free page queue must be locked. */ static inline vm_page_t vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); return (vm_radix_lookup(&object->cache, pindex)); } /* * Remove the given cached page from its containing object's * collection of cached pages. * * The free page queue must be locked. */ static void vm_page_cache_remove(vm_page_t m) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT((m->flags & PG_CACHED) != 0, ("vm_page_cache_remove: page %p is not cached", m)); vm_radix_remove(&m->object->cache, m->pindex); m->object = NULL; vm_cnt.v_cache_count--; } /* * Transfer all of the cached pages with offset greater than or * equal to 'offidxstart' from the original object's cache to the * new object's cache. 
However, any cached pages with offset
 *	greater than or equal to the new object's size are kept in the
 *	original object.  Initially, the new object's cache must be
 *	empty.  Offset 'offidxstart' in the original object must
 *	correspond to offset zero in the new object.
 *
 *	A page that cannot be entered into the new object's cache is
 *	turned into a free page instead of being transferred.
 *
 *	The new object must be locked.
 */
void
vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
    vm_object_t new_object)
{
	vm_page_t m;

	/*
	 * Insertion into an object's collection of cached pages
	 * requires the object to be locked.  In contrast, removal does
	 * not.
	 */
	VM_OBJECT_ASSERT_WLOCKED(new_object);
	KASSERT(vm_radix_is_empty(&new_object->cache),
	    ("vm_page_cache_transfer: object %p has cached pages",
	    new_object));
	mtx_lock(&vm_page_queue_free_mtx);
	while ((m = vm_radix_lookup_ge(&orig_object->cache,
	    offidxstart)) != NULL) {
		/*
		 * Transfer all of the pages with offset greater than or
		 * equal to 'offidxstart' from the original object's
		 * cache to the new object's cache.  Stop at the first page
		 * that would land at or beyond the new object's size.
		 */
		if ((m->pindex - offidxstart) >= new_object->size)
			break;
		vm_radix_remove(&orig_object->cache, m->pindex);
		/* Update the page's object and offset. */
		m->object = new_object;
		m->pindex -= offidxstart;
		/* On insertion failure the page is freed, not kept cached. */
		if (vm_radix_insert(&new_object->cache, m))
			vm_page_cache_turn_free(m);
	}
	mtx_unlock(&vm_page_queue_free_mtx);
}

/*
 *	Returns TRUE if a cached page is associated with the given object and
 *	offset, and FALSE otherwise.
 *
 *	The object must be locked.
 */
boolean_t
vm_page_is_cached(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	/*
	 * Insertion into an object's collection of cached pages requires the
	 * object to be locked.  Therefore, if the object is locked and the
	 * object's collection is empty, there is no need to acquire the free
	 * page queues lock in order to prove that the specified page doesn't
	 * exist.
	 */
	VM_OBJECT_ASSERT_WLOCKED(object);
	if (__predict_true(vm_object_cache_is_empty(object)))
		return (FALSE);
	mtx_lock(&vm_page_queue_free_mtx);
	m = vm_page_cache_lookup(object, pindex);
	mtx_unlock(&vm_page_queue_free_mtx);
	return (m != NULL);
}

/*
 *	vm_page_alloc:
 *
 *	Allocate and return a page that is associated with the specified
 *	object and offset pair.  By default, this page is exclusive busied.
 *	NULL is returned when no page may be handed out to the request's
 *	allocation class, when VM_ALLOC_IFCACHED/VM_ALLOC_IFNOTCACHED rules
 *	out the page that was (not) found, or when the page cannot be
 *	inserted into the object.
 *
 *	The caller must always specify an allocation class.
 *
 *	allocation classes:
 *	VM_ALLOC_NORMAL		normal process request
 *	VM_ALLOC_SYSTEM		system *really* needs a page
 *	VM_ALLOC_INTERRUPT	interrupt time request
 *
 *	optional allocation flags:
 *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
 *				intends to allocate
 *	VM_ALLOC_IFCACHED	return page only if it is cached
 *	VM_ALLOC_IFNOTCACHED	return NULL, do not reactivate if the page
 *				is cached
 *	VM_ALLOC_NOBUSY		do not exclusive busy the page
 *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
 *	VM_ALLOC_NOOBJ		page is not associated with an object and
 *				should not be exclusive busy
 *	VM_ALLOC_SBUSY		shared busy the allocated page
 *	VM_ALLOC_WIRED		wire the allocated page
 *	VM_ALLOC_ZERO		prefer a zeroed page
 *
 *	This routine may not sleep.
 */
vm_page_t
vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
{
	struct vnode *vp = NULL;
	vm_object_t m_object;
	vm_page_t m, mpred;
	int flags, req_class;

	mpred = 0;	/* XXX: pacify gcc */
	/*
	 * An object must be given exactly when VM_ALLOC_NOOBJ is clear,
	 * shared busying requires an object, and NOBUSY and SBUSY are
	 * mutually exclusive.
	 */
	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
	    ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object,
	    req));
	if (object != NULL)
		VM_OBJECT_ASSERT_WLOCKED(object);

	req_class = req & VM_ALLOC_CLASS_MASK;

	/*
	 * The page daemon is allowed to dig deeper into the free page list.
	 */
	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
		req_class = VM_ALLOC_SYSTEM;

	/*
	 * Find the resident page at or below "pindex"; it becomes the list
	 * predecessor for the insertion at the end of this function.
	 */
	if (object != NULL) {
		mpred = vm_radix_lookup_le(&object->rtree, pindex);
		KASSERT(mpred == NULL || mpred->pindex != pindex,
		   ("vm_page_alloc: pindex already allocated"));
	}

	/*
	 * The page allocation request can come from consumers which already
	 * hold the free page queue mutex, like vm_page_insert() in
	 * vm_page_cache().
	 */
	mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE);
	if (vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_free_reserved ||
	    (req_class == VM_ALLOC_SYSTEM &&
	    vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_interrupt_free_min) ||
	    (req_class == VM_ALLOC_INTERRUPT &&
	    vm_cnt.v_free_count + vm_cnt.v_cache_count > 0)) {
		/*
		 * Allocate from the free queue if the number of free pages
		 * exceeds the minimum for the request class.
		 */
		if (object != NULL &&
		    (m = vm_page_cache_lookup(object, pindex)) != NULL) {
			/* The wanted page is cached for this very object. */
			if ((req & VM_ALLOC_IFNOTCACHED) != 0) {
				mtx_unlock(&vm_page_queue_free_mtx);
				return (NULL);
			}
			/*
			 * Take the cached page back from the physical
			 * allocator or, failing that, from its reservation
			 * when reservations are compiled in.
			 */
			if (vm_phys_unfree_page(m))
				vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0);
#if VM_NRESERVLEVEL > 0
			else if (!vm_reserv_reactivate_page(m))
#else
			else
#endif
				panic("vm_page_alloc: cache page %p is missing"
				    " from the free queue", m);
		} else if ((req & VM_ALLOC_IFCACHED) != 0) {
			/* Only a cached page was acceptable; none exists. */
			mtx_unlock(&vm_page_queue_free_mtx);
			return (NULL);
#if VM_NRESERVLEVEL > 0
		} else if (object == NULL || (object->flags & (OBJ_COLORED |
		    OBJ_FICTITIOUS)) != OBJ_COLORED || (m =
		    vm_reserv_alloc_page(object, pindex, mpred)) == NULL) {
#else
		} else {
#endif
			/*
			 * No reservation was used or available: fall back
			 * to the physical allocator.
			 */
			m = vm_phys_alloc_pages(object != NULL ?
			    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
#if VM_NRESERVLEVEL > 0
			if (m == NULL && vm_reserv_reclaim_inactive()) {
				m = vm_phys_alloc_pages(object != NULL ?
				    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT,
				    0);
			}
#endif
		}
	} else {
		/*
		 * Not allocatable, give up.  Record the shortfall and wake
		 * the page daemon before returning.
		 */
		mtx_unlock(&vm_page_queue_free_mtx);
		atomic_add_int(&vm_pageout_deficit,
		    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
		pagedaemon_wakeup();
		return (NULL);
	}

	/*
	 *  At this point we had better have found a good page.
	 */
	KASSERT(m != NULL, ("vm_page_alloc: missing page"));
	KASSERT(m->queue == PQ_NONE,
	    ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue));
	KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m));
	KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m));
	KASSERT(!vm_page_sbusied(m),
	    ("vm_page_alloc: page %p is busy", m));
	KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m));
	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
	    ("vm_page_alloc: page %p has unexpected memattr %d", m,
	    pmap_page_get_memattr(m)));
	if ((m->flags & PG_CACHED) != 0) {
		KASSERT((m->flags & PG_ZERO) == 0,
		    ("vm_page_alloc: cached page %p is PG_ZERO", m));
		KASSERT(m->valid != 0,
		    ("vm_page_alloc: cached page %p is invalid", m));
		/*
		 * The contents stay valid only when the cached page is the
		 * very page that was asked for.
		 */
		if (m->object == object && m->pindex == pindex)
			vm_cnt.v_reactivated++;
		else
			m->valid = 0;
		m_object = m->object;
		vm_page_cache_remove(m);
		/*
		 * Remember the vnode whose last cached page was just
		 * taken; it is dropped near the end of this function.
		 */
		if (m_object->type == OBJT_VNODE &&
		    vm_object_cache_is_empty(m_object))
			vp = m_object->handle;
	} else {
		KASSERT(m->valid == 0,
		    ("vm_page_alloc: free page %p is valid", m));
		vm_phys_freecnt_adj(m, -1);
		if ((m->flags & PG_ZERO) != 0)
			vm_page_zero_count--;
	}
	mtx_unlock(&vm_page_queue_free_mtx);

	/*
	 * Initialize the page.  Only the PG_ZERO flag is inherited, and
	 * only when the caller asked for a zeroed page.
	 */
	flags = 0;
	if ((req & VM_ALLOC_ZERO) != 0)
		flags = PG_ZERO;
	flags &= m->flags;
	if ((req & VM_ALLOC_NODUMP) != 0)
		flags |= PG_NODUMP;
	m->flags = flags;
	m->aflags = 0;
	m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
	    VPO_UNMANAGED : 0;
	/* Busy state: unbusied, exclusive (the default) or one sharer. */
	m->busy_lock = VPB_UNBUSIED;
	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
		m->busy_lock = VPB_SINGLE_EXCLUSIVER;
	if ((req & VM_ALLOC_SBUSY) != 0)
		m->busy_lock = VPB_SHARERS_WORD(1);
	if (req & VM_ALLOC_WIRED) {
		/*
		 * The page lock is not required for wiring a page until that
		 * page is inserted into the object.
		 */
		atomic_add_int(&vm_cnt.v_wire_count, 1);
		m->wire_count = 1;
	}
	m->act_count = 0;

	if (object != NULL) {
		if (vm_page_insert_after(m, object, pindex, mpred)) {
			/*
			 * Insertion failed: undo the wiring, detach the page
			 * and free it.  See the comment below about hold
			 * count.
			 */
			if (vp != NULL)
				vdrop(vp);
			pagedaemon_wakeup();
			if (req & VM_ALLOC_WIRED) {
				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
				m->wire_count = 0;
			}
			m->object = NULL;
			m->oflags = VPO_UNMANAGED;
			vm_page_free(m);
			return (NULL);
		}

		/* Ignore device objects; the pager sets "memattr" for them. */
		if (object->memattr != VM_MEMATTR_DEFAULT &&
		    (object->flags & OBJ_FICTITIOUS) == 0)
			pmap_page_set_memattr(m, object->memattr);
	} else
		m->pindex = pindex;

	/*
	 * The following call to vdrop() must come after the above call
	 * to vm_page_insert() in case both affect the same object and
	 * vnode.  Otherwise, the affected vnode's hold count could
	 * temporarily become zero.
	 */
	if (vp != NULL)
		vdrop(vp);

	/*
	 * Don't wakeup too often - wakeup the pageout daemon when
	 * we would be nearly out of memory.
	 */
	if (vm_paging_needed())
		pagedaemon_wakeup();

	return (m);
}

/*
 * Drop the vnode recorded in each page on the given list, emptying the
 * list in the process.  The vnode pointer is read from the page's
 * plinks.s.pv field.
 */
static void
vm_page_alloc_contig_vdrop(struct spglist *lst)
{

	while (!SLIST_EMPTY(lst)) {
		vdrop((struct vnode *)SLIST_FIRST(lst)->
		    plinks.s.pv);
		SLIST_REMOVE_HEAD(lst, plinks.s.ss);
	}
}

/*
 *	vm_page_alloc_contig:
 *
 *	Allocate a contiguous set of physical pages of the given size "npages"
 *	from the free lists.  All of the physical pages must be at or above
 *	the given physical address "low" and below the given physical address
 *	"high".  The given value "alignment" determines the alignment of the
 *	first physical page in the set.
If the given value "boundary" is * non-zero, then the set of physical pages cannot cross any physical * address boundary that is a multiple of that value. Both "alignment" * and "boundary" must be a power of two. * * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT, * then the memory attribute setting for the physical pages is configured * to the object's memory attribute setting. Otherwise, the memory * attribute setting for the physical pages is configured to "memattr", * overriding the object's memory attribute setting. However, if the * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the * memory attribute setting for the physical pages cannot be configured * to VM_MEMATTR_DEFAULT. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page * * This routine may not sleep. 
*/ vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vnode *drop; struct spglist deferred_vdrop_list; vm_page_t m, m_tmp, m_ret; u_int flags; int req_class; KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object, req)); if (object != NULL) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_PHYS, ("vm_page_alloc_contig: object %p isn't OBJT_PHYS", object)); } KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; SLIST_INIT(&deferred_vdrop_list); mtx_lock(&vm_page_queue_free_mtx); if (vm_cnt.v_free_count + vm_cnt.v_cache_count >= npages + vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count + vm_cnt.v_cache_count >= npages + vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count + vm_cnt.v_cache_count >= npages)) { #if VM_NRESERVLEVEL > 0 retry: if (object == NULL || (object->flags & OBJ_COLORED) == 0 || (m_ret = vm_reserv_alloc_contig(object, pindex, npages, low, high, alignment, boundary)) == NULL) #endif m_ret = vm_phys_alloc_contig(npages, low, high, alignment, boundary); } else { mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, npages); pagedaemon_wakeup(); return (NULL); } if (m_ret != NULL) for (m = m_ret; m < &m_ret[npages]; m++) { drop = vm_page_alloc_init(m); if (drop != NULL) { /* * Enqueue the vnode for deferred vdrop(). 
*/ m->plinks.s.pv = drop; SLIST_INSERT_HEAD(&deferred_vdrop_list, m, plinks.s.ss); } } else { #if VM_NRESERVLEVEL > 0 if (vm_reserv_reclaim_contig(npages, low, high, alignment, boundary)) goto retry; #endif } mtx_unlock(&vm_page_queue_free_mtx); if (m_ret == NULL) return (NULL); /* * Initialize the pages. Only the PG_ZERO flag is inherited. */ flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; if ((req & VM_ALLOC_WIRED) != 0) atomic_add_int(&vm_cnt.v_wire_count, npages); if (object != NULL) { if (object->memattr != VM_MEMATTR_DEFAULT && memattr == VM_MEMATTR_DEFAULT) memattr = object->memattr; } for (m = m_ret; m < &m_ret[npages]; m++) { m->aflags = 0; m->flags = (m->flags | PG_NODUMP) & flags; m->busy_lock = VPB_UNBUSIED; if (object != NULL) { if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) m->busy_lock = VPB_SINGLE_EXCLUSIVER; if ((req & VM_ALLOC_SBUSY) != 0) m->busy_lock = VPB_SHARERS_WORD(1); } if ((req & VM_ALLOC_WIRED) != 0) m->wire_count = 1; /* Unmanaged pages don't use "act_count". */ m->oflags = VPO_UNMANAGED; if (object != NULL) { if (vm_page_insert(m, object, pindex)) { vm_page_alloc_contig_vdrop( &deferred_vdrop_list); if (vm_paging_needed()) pagedaemon_wakeup(); if ((req & VM_ALLOC_WIRED) != 0) atomic_subtract_int(&vm_cnt.v_wire_count, npages); for (m_tmp = m, m = m_ret; m < &m_ret[npages]; m++) { if ((req & VM_ALLOC_WIRED) != 0) m->wire_count = 0; if (m >= m_tmp) m->object = NULL; vm_page_free(m); } return (NULL); } } else m->pindex = pindex; if (memattr != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, memattr); pindex++; } vm_page_alloc_contig_vdrop(&deferred_vdrop_list); if (vm_paging_needed()) pagedaemon_wakeup(); return (m_ret); } /* * Initialize a page that has been freshly dequeued from a freelist. * The caller has to drop the vnode returned, if it is not NULL. * * This function may only be used to initialize unmanaged pages. * * To be called with vm_page_queue_free_mtx held. 
*/ static struct vnode * vm_page_alloc_init(vm_page_t m) { struct vnode *drop; vm_object_t m_object; KASSERT(m->queue == PQ_NONE, ("vm_page_alloc_init: page %p has unexpected queue %d", m, m->queue)); KASSERT(m->wire_count == 0, ("vm_page_alloc_init: page %p is wired", m)); KASSERT(m->hold_count == 0, ("vm_page_alloc_init: page %p is held", m)); KASSERT(!vm_page_sbusied(m), ("vm_page_alloc_init: page %p is busy", m)); KASSERT(m->dirty == 0, ("vm_page_alloc_init: page %p is dirty", m)); KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("vm_page_alloc_init: page %p has unexpected memattr %d", m, pmap_page_get_memattr(m))); mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); drop = NULL; if ((m->flags & PG_CACHED) != 0) { KASSERT((m->flags & PG_ZERO) == 0, ("vm_page_alloc_init: cached page %p is PG_ZERO", m)); m->valid = 0; m_object = m->object; vm_page_cache_remove(m); if (m_object->type == OBJT_VNODE && vm_object_cache_is_empty(m_object)) drop = m_object->handle; } else { KASSERT(m->valid == 0, ("vm_page_alloc_init: free page %p is valid", m)); vm_phys_freecnt_adj(m, -1); if ((m->flags & PG_ZERO) != 0) vm_page_zero_count--; } return (drop); } /* * vm_page_alloc_freelist: * * Allocate a physical page from the specified free page list. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page * * This routine may not sleep. */ vm_page_t vm_page_alloc_freelist(int flind, int req) { struct vnode *drop; vm_page_t m; u_int flags; int req_class; req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. 
*/ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; /* * Do not allocate reserved pages unless the req has asked for it. */ mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE); if (vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count + vm_cnt.v_cache_count > 0)) m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0); else { mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); pagedaemon_wakeup(); return (NULL); } if (m == NULL) { mtx_unlock(&vm_page_queue_free_mtx); return (NULL); } drop = vm_page_alloc_init(m); mtx_unlock(&vm_page_queue_free_mtx); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ m->aflags = 0; flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; m->flags &= flags; if ((req & VM_ALLOC_WIRED) != 0) { /* * The page lock is not required for wiring a page that does * not belong to an object. */ atomic_add_int(&vm_cnt.v_wire_count, 1); m->wire_count = 1; } /* Unmanaged pages don't use "act_count". */ m->oflags = VPO_UNMANAGED; if (drop != NULL) vdrop(drop); if (vm_paging_needed()) pagedaemon_wakeup(); return (m); +} + +#define VPSC_ANY 0 /* No restrictions. */ +#define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */ +#define VPSC_NOSUPER 2 /* Skip superpages. */ + +/* + * vm_page_scan_contig: + * + * Scan vm_page_array[] between the specified entries "m_start" and + * "m_end" for a run of contiguous physical pages that satisfy the + * specified conditions, and return the lowest page in the run. The + * specified "alignment" determines the alignment of the lowest physical + * page in the run. 
If the specified "boundary" is non-zero, then the + * run of physical pages cannot span a physical address that is a + * multiple of "boundary". + * + * "m_end" is never dereferenced, so it need not point to a vm_page + * structure within vm_page_array[]. + * + * "npages" must be greater than zero. "m_start" and "m_end" must not + * span a hole (or discontiguity) in the physical address space. Both + * "alignment" and "boundary" must be a power of two. + */ +vm_page_t +vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, + u_long alignment, vm_paddr_t boundary, int options) +{ + struct mtx *m_mtx, *new_mtx; + vm_object_t object; + vm_paddr_t pa; + vm_page_t m, m_run; +#if VM_NRESERVLEVEL > 0 + int level; +#endif + int m_inc, order, run_ext, run_len; + + KASSERT(npages > 0, ("npages is 0")); + KASSERT(powerof2(alignment), ("alignment is not a power of 2")); + KASSERT(powerof2(boundary), ("boundary is not a power of 2")); + m_run = NULL; + run_len = 0; + m_mtx = NULL; + for (m = m_start; m < m_end && run_len < npages; m += m_inc) { + KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, + ("page %p is PG_FICTITIOUS or PG_MARKER", m)); + + /* + * If the current page would be the start of a run, check its + * physical address against the end, alignment, and boundary + * conditions. If it doesn't satisfy these conditions, either + * terminate the scan or advance to the next page that + * satisfies the failed condition. + */ + if (run_len == 0) { + KASSERT(m_run == NULL, ("m_run != NULL")); + if (m + npages > m_end) + break; + pa = VM_PAGE_TO_PHYS(m); + if ((pa & (alignment - 1)) != 0) { + m_inc = atop(roundup2(pa, alignment) - pa); + continue; + } + if (((pa ^ (pa + ptoa(npages) - 1)) & ~(boundary - + 1)) != 0) { + m_inc = atop(roundup2(pa, boundary) - pa); + continue; + } + } else + KASSERT(m_run != NULL, ("m_run == NULL")); + + /* + * Avoid releasing and reacquiring the same page lock. 
+ */ + new_mtx = vm_page_lockptr(m); + if (m_mtx != new_mtx) { + if (m_mtx != NULL) + mtx_unlock(m_mtx); + m_mtx = new_mtx; + mtx_lock(m_mtx); + } + m_inc = 1; +retry: + if (m->wire_count != 0 || m->hold_count != 0) + run_ext = 0; +#if VM_NRESERVLEVEL > 0 + else if ((level = vm_reserv_level(m)) >= 0 && + (options & VPSC_NORESERV) != 0) { + run_ext = 0; + /* Advance to the end of the reservation. */ + pa = VM_PAGE_TO_PHYS(m); + m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - + pa); + } +#endif + else if ((object = m->object) != NULL) { + /* + * The page is considered eligible for relocation if + * and only if it could be laundered or reclaimed by + * the page daemon. + */ + if (!VM_OBJECT_TRYRLOCK(object)) { + mtx_unlock(m_mtx); + VM_OBJECT_RLOCK(object); + mtx_lock(m_mtx); + if (m->object != object) { + /* + * The page may have been freed. + */ + VM_OBJECT_RUNLOCK(object); + goto retry; + } else if (m->wire_count != 0 || + m->hold_count != 0) { + run_ext = 0; + goto unlock; + } + } + KASSERT((m->flags & PG_UNHOLDFREE) == 0, + ("page %p is PG_UNHOLDFREE", m)); + /* Don't care: PG_NODUMP, PG_WINATCFLS, PG_ZERO. */ + if (object->type != OBJT_DEFAULT && + object->type != OBJT_SWAP && + object->type != OBJT_VNODE) + run_ext = 0; + else if ((m->flags & PG_CACHED) != 0 || + m != vm_page_lookup(object, m->pindex)) { + /* + * The page is cached or recently converted + * from cached to free. + */ +#if VM_NRESERVLEVEL > 0 + if (level >= 0) { + /* + * The page is reserved. Extend the + * current run by one page. + */ + run_ext = 1; + } else +#endif + if ((order = m->order) < VM_NFREEORDER) { + /* + * The page is enqueued in the + * physical memory allocator's cache/ + * free page queues. Moreover, it is + * the first page in a power-of-two- + * sized run of contiguous cache/free + * pages. Add these pages to the end + * of the current run, and jump + * ahead. 
+ */ + run_ext = 1 << order; + m_inc = 1 << order; + } else + run_ext = 0; +#if VM_NRESERVLEVEL > 0 + } else if ((options & VPSC_NOSUPER) != 0 && + (level = vm_reserv_level_iffullpop(m)) >= 0) { + run_ext = 0; + /* Advance to the end of the superpage. */ + pa = VM_PAGE_TO_PHYS(m); + m_inc = atop(roundup2(pa + 1, + vm_reserv_size(level)) - pa); +#endif + } else if (object->memattr == VM_MEMATTR_DEFAULT && + m->queue != PQ_NONE && !vm_page_busied(m)) { + /* + * The page is allocated but eligible for + * relocation. Extend the current run by one + * page. + */ + KASSERT(pmap_page_get_memattr(m) == + VM_MEMATTR_DEFAULT, + ("page %p has an unexpected memattr", m)); + KASSERT((m->oflags & (VPO_SWAPINPROG | + VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, + ("page %p has unexpected oflags", m)); + /* Don't care: VPO_NOSYNC. */ + run_ext = 1; + } else + run_ext = 0; +unlock: + VM_OBJECT_RUNLOCK(object); +#if VM_NRESERVLEVEL > 0 + } else if (level >= 0) { + /* + * The page is reserved but not yet allocated. In + * other words, it is still cached or free. Extend + * the current run by one page. + */ + run_ext = 1; +#endif + } else if ((order = m->order) < VM_NFREEORDER) { + /* + * The page is enqueued in the physical memory + * allocator's cache/free page queues. Moreover, it + * is the first page in a power-of-two-sized run of + * contiguous cache/free pages. Add these pages to + * the end of the current run, and jump ahead. + */ + run_ext = 1 << order; + m_inc = 1 << order; + } else { + /* + * Skip the page for one of the following reasons: (1) + * It is enqueued in the physical memory allocator's + * cache/free page queues. However, it is not the + * first page in a run of contiguous cache/free pages. + * (This case rarely occurs because the scan is + * performed in ascending order.) (2) It is not + * reserved, and it is transitioning from free to + * allocated. (Conversely, the transition from + * allocated to free for managed pages is blocked by + * the page lock.) 
(3) It is allocated but not + * contained by an object and not wired, e.g., + * allocated by Xen's balloon driver. + */ + run_ext = 0; + } + + /* + * Extend or reset the current run of pages. + */ + if (run_ext > 0) { + if (run_len == 0) + m_run = m; + run_len += run_ext; + } else { + if (run_len > 0) { + m_run = NULL; + run_len = 0; + } + } + } + if (m_mtx != NULL) + mtx_unlock(m_mtx); + if (run_len >= npages) + return (m_run); + return (NULL); +} + +/* + * vm_page_reclaim_run: + * + * Try to relocate each of the allocated virtual pages within the + * specified run of physical pages to a new physical address. Free the + * physical pages underlying the relocated virtual pages. A virtual page + * is relocatable if and only if it could be laundered or reclaimed by + * the page daemon. Whenever possible, a virtual page is relocated to a + * physical address above "high". + * + * Returns 0 if every physical page within the run was already free or + * just freed by a successful relocation. Otherwise, returns a non-zero + * value indicating why the last attempt to relocate a virtual page was + * unsuccessful. + * + * "req_class" must be an allocation class. + */ +static int +vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run, + vm_paddr_t high) +{ + struct mtx *m_mtx, *new_mtx; + struct spglist free; + vm_object_t object; + vm_paddr_t pa; + vm_page_t m, m_end, m_new; + int error, order, req; + + KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class, + ("req_class is not an allocation class")); + SLIST_INIT(&free); + error = 0; + m = m_run; + m_end = m_run + npages; + m_mtx = NULL; + for (; error == 0 && m < m_end; m++) { + KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, + ("page %p is PG_FICTITIOUS or PG_MARKER", m)); + + /* + * Avoid releasing and reacquiring the same page lock. 
+ */ + new_mtx = vm_page_lockptr(m); + if (m_mtx != new_mtx) { + if (m_mtx != NULL) + mtx_unlock(m_mtx); + m_mtx = new_mtx; + mtx_lock(m_mtx); + } +retry: + if (m->wire_count != 0 || m->hold_count != 0) + error = EBUSY; + else if ((object = m->object) != NULL) { + /* + * The page is relocated if and only if it could be + * laundered or reclaimed by the page daemon. + */ + if (!VM_OBJECT_TRYWLOCK(object)) { + mtx_unlock(m_mtx); + VM_OBJECT_WLOCK(object); + mtx_lock(m_mtx); + if (m->object != object) { + /* + * The page may have been freed. + */ + VM_OBJECT_WUNLOCK(object); + goto retry; + } else if (m->wire_count != 0 || + m->hold_count != 0) { + error = EBUSY; + goto unlock; + } + } + KASSERT((m->flags & PG_UNHOLDFREE) == 0, + ("page %p is PG_UNHOLDFREE", m)); + /* Don't care: PG_NODUMP, PG_WINATCFLS, PG_ZERO. */ + if (object->type != OBJT_DEFAULT && + object->type != OBJT_SWAP && + object->type != OBJT_VNODE) + error = EINVAL; + else if ((m->flags & PG_CACHED) != 0 || + m != vm_page_lookup(object, m->pindex)) { + /* + * The page is cached or recently converted + * from cached to free. + */ + VM_OBJECT_WUNLOCK(object); + goto cached; + } else if (object->memattr != VM_MEMATTR_DEFAULT) + error = EINVAL; + else if (m->queue != PQ_NONE && !vm_page_busied(m)) { + KASSERT(pmap_page_get_memattr(m) == + VM_MEMATTR_DEFAULT, + ("page %p has an unexpected memattr", m)); + KASSERT((m->oflags & (VPO_SWAPINPROG | + VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, + ("page %p has unexpected oflags", m)); + /* Don't care: VPO_NOSYNC. */ + if (m->valid != 0) { + /* + * First, try to allocate a new page + * that is above "high". Failing + * that, try to allocate a new page + * that is below "m_run". Allocate + * the new page between the end of + * "m_run" and "high" only as a last + * resort. 
+ */ + req = req_class | VM_ALLOC_NOOBJ; + if ((m->flags & PG_NODUMP) != 0) + req |= VM_ALLOC_NODUMP; + if (trunc_page(high) != + ~(vm_paddr_t)PAGE_MASK) { + m_new = vm_page_alloc_contig( + NULL, 0, req, 1, + round_page(high), + ~(vm_paddr_t)0, + PAGE_SIZE, 0, + VM_MEMATTR_DEFAULT); + } else + m_new = NULL; + if (m_new == NULL) { + pa = VM_PAGE_TO_PHYS(m_run); + m_new = vm_page_alloc_contig( + NULL, 0, req, 1, + 0, pa - 1, PAGE_SIZE, 0, + VM_MEMATTR_DEFAULT); + } + if (m_new == NULL) { + pa += ptoa(npages); + m_new = vm_page_alloc_contig( + NULL, 0, req, 1, + pa, high, PAGE_SIZE, 0, + VM_MEMATTR_DEFAULT); + } + if (m_new == NULL) { + error = ENOMEM; + goto unlock; + } + KASSERT(m_new->wire_count == 0, + ("page %p is wired", m)); + + /* + * Replace "m" with the new page. For + * vm_page_replace(), "m" must be busy + * and dequeued. Finally, change "m" + * as if vm_page_free() was called. + */ + if (object->ref_count != 0) + pmap_remove_all(m); + m_new->aflags = m->aflags; + KASSERT(m_new->oflags == VPO_UNMANAGED, + ("page %p is managed", m)); + m_new->oflags = m->oflags & VPO_NOSYNC; + pmap_copy_page(m, m_new); + m_new->valid = m->valid; + m_new->dirty = m->dirty; + m->flags &= ~PG_ZERO; + vm_page_xbusy(m); + vm_page_remque(m); + vm_page_replace_checked(m_new, object, + m->pindex, m); + m->valid = 0; + vm_page_undirty(m); + + /* + * The new page must be deactivated + * before the object is unlocked. 
+ */ + new_mtx = vm_page_lockptr(m_new); + if (m_mtx != new_mtx) { + mtx_unlock(m_mtx); + m_mtx = new_mtx; + mtx_lock(m_mtx); + } + vm_page_deactivate(m_new); + } else { + m->flags &= ~PG_ZERO; + vm_page_remque(m); + vm_page_remove(m); + KASSERT(m->dirty == 0, + ("page %p is dirty", m)); + } + SLIST_INSERT_HEAD(&free, m, plinks.s.ss); + } else + error = EBUSY; +unlock: + VM_OBJECT_WUNLOCK(object); + } else { +cached: + mtx_lock(&vm_page_queue_free_mtx); + order = m->order; + if (order < VM_NFREEORDER) { + /* + * The page is enqueued in the physical memory + * allocator's cache/free page queues. + * Moreover, it is the first page in a power- + * of-two-sized run of contiguous cache/free + * pages. Jump ahead to the last page within + * that run, and continue from there. + */ + m += (1 << order) - 1; + } +#if VM_NRESERVLEVEL > 0 + else if (vm_reserv_is_page_free(m)) + order = 0; +#endif + mtx_unlock(&vm_page_queue_free_mtx); + if (order == VM_NFREEORDER) + error = EINVAL; + } + } + if (m_mtx != NULL) + mtx_unlock(m_mtx); + if ((m = SLIST_FIRST(&free)) != NULL) { + mtx_lock(&vm_page_queue_free_mtx); + do { + SLIST_REMOVE_HEAD(&free, plinks.s.ss); + vm_phys_freecnt_adj(m, 1); +#if VM_NRESERVLEVEL > 0 + if (!vm_reserv_free_page(m)) +#else + if (true) +#endif + vm_phys_free_pages(m, 0); + } while ((m = SLIST_FIRST(&free)) != NULL); + vm_page_zero_idle_wakeup(); + vm_page_free_wakeup(); + mtx_unlock(&vm_page_queue_free_mtx); + } + return (error); +} + +#define NRUNS 16 + +CTASSERT(powerof2(NRUNS)); + +#define RUN_INDEX(count) ((count) & (NRUNS - 1)) + +#define MIN_RECLAIM 8 + +/* + * vm_page_reclaim_contig: + * + * Reclaim allocated, contiguous physical memory satisfying the specified + * conditions by relocating the virtual pages using that physical memory. + * Returns true if reclamation is successful and false otherwise. Since + * relocation requires the allocation of physical pages, reclamation may + * fail due to a shortage of cache/free pages. 
When reclamation fails, + * callers are expected to perform VM_WAIT before retrying a failed + * allocation operation, e.g., vm_page_alloc_contig(). + * + * The caller must always specify an allocation class through "req". + * + * allocation classes: + * VM_ALLOC_NORMAL normal process request + * VM_ALLOC_SYSTEM system *really* needs a page + * VM_ALLOC_INTERRUPT interrupt time request + * + * The optional allocation flags are ignored. + * + * "npages" must be greater than zero. Both "alignment" and "boundary" + * must be a power of two. + */ +bool +vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, + u_long alignment, vm_paddr_t boundary) +{ + vm_paddr_t curr_low; + vm_page_t m_run, m_runs[NRUNS]; + u_long count, reclaimed; + int error, i, options, req_class; + + KASSERT(npages > 0, ("npages is 0")); + KASSERT(powerof2(alignment), ("alignment is not a power of 2")); + KASSERT(powerof2(boundary), ("boundary is not a power of 2")); + req_class = req & VM_ALLOC_CLASS_MASK; + + /* + * The page daemon is allowed to dig deeper into the free page list. + */ + if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) + req_class = VM_ALLOC_SYSTEM; + + /* + * Return if the number of cached and free pages cannot satisfy the + * requested allocation. + */ + count = vm_cnt.v_free_count + vm_cnt.v_cache_count; + if (count < npages + vm_cnt.v_free_reserved || (count < npages + + vm_cnt.v_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) || + (count < npages && req_class == VM_ALLOC_INTERRUPT)) + return (false); + + /* + * Scan up to three times, relaxing the restrictions ("options") on + * the reclamation of reservations and superpages each time. + */ + for (options = VPSC_NORESERV;;) { + /* + * Find the highest runs that satisfy the given constraints + * and restrictions, and record them in "m_runs". 
+ */ + curr_low = low; + count = 0; + for (;;) { + m_run = vm_phys_scan_contig(npages, curr_low, high, + alignment, boundary, options); + if (m_run == NULL) + break; + curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages); + m_runs[RUN_INDEX(count)] = m_run; + count++; + } + + /* + * Reclaim the highest runs in LIFO (descending) order until + * the number of reclaimed pages, "reclaimed", is at least + * MIN_RECLAIM. Reset "reclaimed" each time because each + * reclamation is idempotent, and runs will (likely) recur + * from one scan to the next as restrictions are relaxed. + */ + reclaimed = 0; + for (i = 0; count > 0 && i < NRUNS; i++) { + count--; + m_run = m_runs[RUN_INDEX(count)]; + error = vm_page_reclaim_run(req_class, npages, m_run, + high); + if (error == 0) { + reclaimed += npages; + if (reclaimed >= MIN_RECLAIM) + return (true); + } + } + + /* + * Either relax the restrictions on the next scan or return if + * the last scan had no restrictions. + */ + if (options == VPSC_NORESERV) + options = VPSC_NOSUPER; + else if (options == VPSC_NOSUPER) + options = VPSC_ANY; + else if (options == VPSC_ANY) + return (reclaimed != 0); + } } /* * vm_wait: (also see VM_WAIT macro) * * Sleep until free pages are available for allocation. * - Called in various places before memory allocations. */ void vm_wait(void) { mtx_lock(&vm_page_queue_free_mtx); if (curproc == pageproc) { vm_pageout_pages_needed = 1; msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx, PDROP | PSWP, "VMWait", 0); } else { if (!vm_pages_needed) { vm_pages_needed = 1; wakeup(&vm_pages_needed); } msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM, "vmwait", 0); } } /* * vm_waitpfault: (also see VM_WAITPFAULT macro) * * Sleep until free pages are available for allocation. * - Called only in vm_fault so that processes page faulting * can be easily tracked. * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing * processes will be able to grab memory first. 
Do not change * this balance without careful testing first. */ void vm_waitpfault(void) { mtx_lock(&vm_page_queue_free_mtx); if (!vm_pages_needed) { vm_pages_needed = 1; wakeup(&vm_pages_needed); } msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER, "pfault", 0); } struct vm_pagequeue * vm_page_pagequeue(vm_page_t m) { return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]); } /* * vm_page_dequeue: * * Remove the given page from its current page queue. * * The page must be locked. */ void vm_page_dequeue(vm_page_t m) { struct vm_pagequeue *pq; vm_page_assert_locked(m); KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_lock(pq); m->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); vm_pagequeue_unlock(pq); } /* * vm_page_dequeue_locked: * * Remove the given page from its current page queue. * * The page and page queue must be locked. */ void vm_page_dequeue_locked(vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); pq = vm_page_pagequeue(m); vm_pagequeue_assert_locked(pq); m->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); } /* * vm_page_enqueue: * * Add the given page to the specified page queue. * * The page must be locked. */ static void vm_page_enqueue(uint8_t queue, vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); KASSERT(queue < PQ_COUNT, ("vm_page_enqueue: invalid queue %u request for page %p", queue, m)); pq = &vm_phys_domain(m)->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); m->queue = queue; TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_inc(pq); vm_pagequeue_unlock(pq); } /* * vm_page_requeue: * * Move the given page to the tail of its current page queue. * * The page must be locked. 
*/ void vm_page_requeue(vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); KASSERT(m->queue != PQ_NONE, ("vm_page_requeue: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_lock(pq); TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_unlock(pq); } /* * vm_page_requeue_locked: * * Move the given page to the tail of its current page queue. * * The page queue must be locked. */ void vm_page_requeue_locked(vm_page_t m) { struct vm_pagequeue *pq; KASSERT(m->queue != PQ_NONE, ("vm_page_requeue_locked: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_assert_locked(pq); TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); } /* * vm_page_activate: * * Put the specified page on the active list (if appropriate). * Ensure that act_count is at least ACT_INIT but do not otherwise * mess with it. * * The page must be locked. */ void vm_page_activate(vm_page_t m) { int queue; vm_page_lock_assert(m, MA_OWNED); if ((queue = m->queue) != PQ_ACTIVE) { if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; if (queue != PQ_NONE) vm_page_dequeue(m); vm_page_enqueue(PQ_ACTIVE, m); } else KASSERT(queue == PQ_NONE, ("vm_page_activate: wired page %p is queued", m)); } else { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; } } /* * vm_page_free_wakeup: * * Helper routine for vm_page_free_toq() and vm_page_cache(). This * routine is called when a page has been added to the cache or free * queues. * * The page queues must be locked. */ static inline void vm_page_free_wakeup(void) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); /* * if pageout daemon needs pages, then tell it that there are * some free. 
*/ if (vm_pageout_pages_needed && vm_cnt.v_cache_count + vm_cnt.v_free_count >= vm_cnt.v_pageout_free_min) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } /* * wakeup processes that are waiting on memory if we hit a * high water mark. And wakeup scheduler process if we have * lots of memory. this process will swapin processes. */ if (vm_pages_needed && !vm_page_count_min()) { vm_pages_needed = 0; wakeup(&vm_cnt.v_free_count); } } /* * Turn a cached page into a free page, by changing its attributes. * Keep the statistics up-to-date. * * The free page queue must be locked. */ static void vm_page_cache_turn_free(vm_page_t m) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); m->object = NULL; m->valid = 0; KASSERT((m->flags & PG_CACHED) != 0, ("vm_page_cache_turn_free: page %p is not cached", m)); m->flags &= ~PG_CACHED; vm_cnt.v_cache_count--; vm_phys_freecnt_adj(m, 1); } /* * vm_page_free_toq: * * Returns the given page to the free list, * disassociating it with any VM object. * * The object must be locked. The page must be locked if it is managed. */ void vm_page_free_toq(vm_page_t m) { if ((m->oflags & VPO_UNMANAGED) == 0) { vm_page_lock_assert(m, MA_OWNED); KASSERT(!pmap_page_is_mapped(m), ("vm_page_free_toq: freeing mapped page %p", m)); } else KASSERT(m->queue == PQ_NONE, ("vm_page_free_toq: unmanaged page %p is queued", m)); PCPU_INC(cnt.v_tfree); if (vm_page_sbusied(m)) panic("vm_page_free: freeing busy page %p", m); /* * Unqueue, then remove page. Note that we cannot destroy * the page here because we do not want to call the pager's * callback routine until after we've put the page on the * appropriate free queue. */ vm_page_remque(m); vm_page_remove(m); /* * If fictitious remove object association and * return, otherwise delay object association removal. 
*/ if ((m->flags & PG_FICTITIOUS) != 0) { return; } m->valid = 0; vm_page_undirty(m); if (m->wire_count != 0) panic("vm_page_free: freeing wired page %p", m); if (m->hold_count != 0) { m->flags &= ~PG_ZERO; KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("vm_page_free: freeing PG_UNHOLDFREE page %p", m)); m->flags |= PG_UNHOLDFREE; } else { /* * Restore the default memory attribute to the page. */ if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); /* * Insert the page into the physical memory allocator's * cache/free page queues. */ mtx_lock(&vm_page_queue_free_mtx); vm_phys_freecnt_adj(m, 1); #if VM_NRESERVLEVEL > 0 if (!vm_reserv_free_page(m)) #else if (TRUE) #endif vm_phys_free_pages(m, 0); if ((m->flags & PG_ZERO) != 0) ++vm_page_zero_count; else vm_page_zero_idle_wakeup(); vm_page_free_wakeup(); mtx_unlock(&vm_page_queue_free_mtx); } } /* * vm_page_wire: * * Mark this page as wired down by yet * another map, removing it from paging queues * as necessary. * * If the page is fictitious, then its wire count must remain one. * * The page must be locked. */ void vm_page_wire(vm_page_t m) { /* * Only bump the wire statistics if the page is not already wired, * and only unqueue the page if it is on some queue (if it is unmanaged * it is already off the queues). */ vm_page_lock_assert(m, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("vm_page_wire: fictitious page %p's wire count isn't one", m)); return; } if (m->wire_count == 0) { KASSERT((m->oflags & VPO_UNMANAGED) == 0 || m->queue == PQ_NONE, ("vm_page_wire: unmanaged page %p is queued", m)); vm_page_remque(m); atomic_add_int(&vm_cnt.v_wire_count, 1); } m->wire_count++; KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m)); } /* * vm_page_unwire: * * Release one wiring of the specified page, potentially allowing it to be * paged out. Returns TRUE if the number of wirings transitions to zero and * FALSE otherwise. 
* * Only managed pages belonging to an object can be paged out. If the number * of wirings transitions to zero and the page is eligible for page out, then * the page is added to the specified paging queue (unless PQ_NONE is * specified). * * If a page is fictitious, then its wire count must always be one. * * A managed page must be locked. */ boolean_t vm_page_unwire(vm_page_t m, uint8_t queue) { KASSERT(queue < PQ_COUNT || queue == PQ_NONE, ("vm_page_unwire: invalid queue %u request for page %p", queue, m)); if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_assert_locked(m); if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("vm_page_unwire: fictitious page %p's wire count isn't one", m)); return (FALSE); } if (m->wire_count > 0) { m->wire_count--; if (m->wire_count == 0) { atomic_subtract_int(&vm_cnt.v_wire_count, 1); if ((m->oflags & VPO_UNMANAGED) == 0 && m->object != NULL && queue != PQ_NONE) { if (queue == PQ_INACTIVE) m->flags &= ~PG_WINATCFLS; vm_page_enqueue(queue, m); } return (TRUE); } else return (FALSE); } else panic("vm_page_unwire: page %p's wire count is zero", m); } /* * Move the specified page to the inactive queue. * * Many pages placed on the inactive queue should actually go * into the cache, but it is difficult to figure out which. What * we do instead, if the inactive target is well met, is to put * clean pages at the head of the inactive queue instead of the tail. * This will cause them to be moved to the cache more quickly and * if not actively re-referenced, reclaimed more quickly. If we just * stick these pages at the end of the inactive queue, heavy filesystem * meta-data accesses can cause an unnecessary paging load on memory bound * processes. This optimization causes one-time-use metadata to be * reused more quickly. * * Normally noreuse is FALSE, resulting in LRU operation. 
noreuse is set * to TRUE if we want this page to be 'as if it were placed in the cache', * except without unmapping it from the process address space. In * practice this is implemented by inserting the page at the head of the * queue, using a marker page to guide FIFO insertion ordering. * * The page must be locked. */ static inline void _vm_page_deactivate(vm_page_t m, boolean_t noreuse) { struct vm_pagequeue *pq; int queue; vm_page_assert_locked(m); /* * Ignore if the page is already inactive, unless it is unlikely to be * reactivated. */ if ((queue = m->queue) == PQ_INACTIVE && !noreuse) return; if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE]; /* Avoid multiple acquisitions of the inactive queue lock. */ if (queue == PQ_INACTIVE) { vm_pagequeue_lock(pq); vm_page_dequeue_locked(m); } else { if (queue != PQ_NONE) vm_page_dequeue(m); m->flags &= ~PG_WINATCFLS; vm_pagequeue_lock(pq); } m->queue = PQ_INACTIVE; if (noreuse) TAILQ_INSERT_BEFORE(&vm_phys_domain(m)->vmd_inacthead, m, plinks.q); else TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_inc(pq); vm_pagequeue_unlock(pq); } } /* * Move the specified page to the inactive queue. * * The page must be locked. */ void vm_page_deactivate(vm_page_t m) { _vm_page_deactivate(m, FALSE); } /* * Move the specified page to the inactive queue with the expectation * that it is unlikely to be reused. * * The page must be locked. */ void vm_page_deactivate_noreuse(vm_page_t m) { _vm_page_deactivate(m, TRUE); } /* * vm_page_try_to_cache: * * Returns 0 on failure, 1 on success */ int vm_page_try_to_cache(vm_page_t m) { vm_page_lock_assert(m, MA_OWNED); VM_OBJECT_ASSERT_WLOCKED(m->object); if (m->dirty || m->hold_count || m->wire_count || (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m)) return (0); pmap_remove_all(m); if (m->dirty) return (0); vm_page_cache(m); return (1); } /* * vm_page_try_to_free() * * Attempt to free the page. 
If we cannot free it, we do nothing. * 1 is returned on success, 0 on failure. */ int vm_page_try_to_free(vm_page_t m) { vm_page_lock_assert(m, MA_OWNED); if (m->object != NULL) VM_OBJECT_ASSERT_WLOCKED(m->object); if (m->dirty || m->hold_count || m->wire_count || (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m)) return (0); pmap_remove_all(m); if (m->dirty) return (0); vm_page_free(m); return (1); } /* * vm_page_cache * * Put the specified page onto the page cache queue (if appropriate). * * The object and page must be locked. */ void vm_page_cache(vm_page_t m) { vm_object_t object; boolean_t cache_was_empty; vm_page_lock_assert(m, MA_OWNED); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); if (vm_page_busied(m) || (m->oflags & VPO_UNMANAGED) || m->hold_count || m->wire_count) panic("vm_page_cache: attempting to cache busy page"); KASSERT(!pmap_page_is_mapped(m), ("vm_page_cache: page %p is mapped", m)); KASSERT(m->dirty == 0, ("vm_page_cache: page %p is dirty", m)); if (m->valid == 0 || object->type == OBJT_DEFAULT || (object->type == OBJT_SWAP && !vm_pager_has_page(object, m->pindex, NULL, NULL))) { /* * Hypothesis: A cache-eligible page belonging to a * default object or swap object but without a backing * store must be zero filled. */ vm_page_free(m); return; } KASSERT((m->flags & PG_CACHED) == 0, ("vm_page_cache: page %p is already cached", m)); /* * Remove the page from the paging queues. */ vm_page_remque(m); /* * Remove the page from the object's collection of resident * pages. */ vm_radix_remove(&object->rtree, m->pindex); TAILQ_REMOVE(&object->memq, m, listq); object->resident_page_count--; /* * Restore the default memory attribute to the page. */ if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); /* * Insert the page into the object's collection of cached pages * and the physical memory allocator's cache/free page queues. 
*/ m->flags &= ~PG_ZERO; mtx_lock(&vm_page_queue_free_mtx); cache_was_empty = vm_radix_is_empty(&object->cache); if (vm_radix_insert(&object->cache, m)) { mtx_unlock(&vm_page_queue_free_mtx); if (object->resident_page_count == 0) vdrop(object->handle); m->object = NULL; vm_page_free(m); return; } /* * The above call to vm_radix_insert() could reclaim the one pre- * existing cached page from this object, resulting in a call to * vdrop(). */ if (!cache_was_empty) cache_was_empty = vm_radix_is_singleton(&object->cache); m->flags |= PG_CACHED; vm_cnt.v_cache_count++; PCPU_INC(cnt.v_tcached); #if VM_NRESERVLEVEL > 0 if (!vm_reserv_free_page(m)) { #else if (TRUE) { #endif vm_phys_free_pages(m, 0); } vm_page_free_wakeup(); mtx_unlock(&vm_page_queue_free_mtx); /* * Increment the vnode's hold count if this is the object's only * cached page. Decrement the vnode's hold count if this was * the object's only resident page. */ if (object->type == OBJT_VNODE) { if (cache_was_empty && object->resident_page_count != 0) vhold(object->handle); else if (!cache_was_empty && object->resident_page_count == 0) vdrop(object->handle); } } /* * vm_page_advise * * Deactivate or do nothing, as appropriate. * * The object and page must be locked. */ void vm_page_advise(vm_page_t m, int advice) { vm_page_assert_locked(m); VM_OBJECT_ASSERT_WLOCKED(m->object); if (advice == MADV_FREE) /* * Mark the page clean. This will allow the page to be freed * up by the system. However, such pages are often reused * quickly by malloc() so we do not do anything that would * cause a page fault if we can help it. * * Specifically, we do not try to actually free the page now * nor do we try to put it in the cache (which would cause a * page fault on reuse). * * But we do make the page as freeable as we can without * actually taking the step of unmapping it. */ m->dirty = 0; else if (advice != MADV_DONTNEED) return; /* * Clear any references to the page. 
Otherwise, the page daemon will * immediately reactivate the page. */ vm_page_aflag_clear(m, PGA_REFERENCED); if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); /* * Place clean pages at the head of the inactive queue rather than the * tail, thus defeating the queue's LRU operation and ensuring that the * page will be reused quickly. */ _vm_page_deactivate(m, m->dirty == 0); } /* * Grab a page, waiting until we are waken up due to the page * changing state. We keep on waiting, if the page continues * to be in the object. If the page doesn't exist, first allocate it * and then conditionally zero it. * * This routine may sleep. * * The object must be locked on entry. The lock will, however, be released * and reacquired if the routine sleeps. */ vm_page_t vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; int sleep; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ? vm_page_xbusied(m) : vm_page_busied(m); if (sleep) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (NULL); /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. 
*/ vm_page_aflag_set(m, PGA_REFERENCED); vm_page_lock(m); VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(m, "pgrbwt"); VM_OBJECT_WLOCK(object); goto retrylookup; } else { if ((allocflags & VM_ALLOC_WIRED) != 0) { vm_page_lock(m); vm_page_wire(m); vm_page_unlock(m); } if ((allocflags & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) vm_page_xbusy(m); if ((allocflags & VM_ALLOC_SBUSY) != 0) vm_page_sbusy(m); return (m); } } m = vm_page_alloc(object, pindex, allocflags); if (m == NULL) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (NULL); VM_OBJECT_WUNLOCK(object); VM_WAIT; VM_OBJECT_WLOCK(object); goto retrylookup; } else if (m->valid != 0) return (m); if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } /* * Mapping function for valid or dirty bits in a page. Returns a mask * with one bit set for every DEV_BSIZE-sized block touched by the byte * range [base, base + size), or zero when size is zero. * * Inputs are required to range within a page. */ vm_page_bits_t vm_page_bits(int base, int size) { int first_bit; int last_bit; KASSERT( base + size <= PAGE_SIZE, ("vm_page_bits: illegal base/size %d/%d", base, size) ); if (size == 0) /* handle degenerate case */ return (0); first_bit = base >> DEV_BSHIFT; last_bit = (base + size - 1) >> DEV_BSHIFT; return (((vm_page_bits_t)2 << last_bit) - ((vm_page_bits_t)1 << first_bit)); } /* * vm_page_set_valid_range: * * Sets portions of a page valid. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zeroed. * * (base + size) must be less than or equal to PAGE_SIZE. */ void vm_page_set_valid_range(vm_page_t m, int base, int size) { int endoff, frag; VM_OBJECT_ASSERT_WLOCKED(m->object); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. 
*/ if ((frag = base & ~(DEV_BSIZE - 1)) != base && (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff && (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Assert that no previously invalid block that is now being validated * is already dirty. */ KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, ("vm_page_set_valid_range: page %p is dirty", m)); /* * Set valid bits inclusive of any overlap. */ m->valid |= vm_page_bits(base, size); } /* * Clear the given bits from the specified page's dirty field. */ static __inline void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) { uintptr_t addr; #if PAGE_SIZE < 16384 int shift; #endif /* * If the object is locked and the page is neither exclusive busy nor * write mapped, then the page's dirty field cannot possibly be * set by a concurrent pmap operation. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) m->dirty &= ~pagebits; else { /* * The pmap layer can call vm_page_dirty() without * holding a distinguished lock. The combination of * the object's lock and an atomic operation suffice * to guarantee consistency of the page dirty field. * * For PAGE_SIZE == 32768 case, compiler already * properly aligns the dirty field, so no forcible * alignment is needed. Only require existence of * atomic_clear_64 when page size is 32768. 
*/ addr = (uintptr_t)&m->dirty; #if PAGE_SIZE == 32768 atomic_clear_64((uint64_t *)addr, pagebits); #elif PAGE_SIZE == 16384 atomic_clear_32((uint32_t *)addr, pagebits); #else /* PAGE_SIZE <= 8192 */ /* * Use a trick to perform a 32-bit atomic on the * containing aligned word, to not depend on the existence * of atomic_clear_{8, 16}. */ shift = addr & (sizeof(uint32_t) - 1); #if BYTE_ORDER == BIG_ENDIAN shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY; #else shift *= NBBY; #endif addr &= ~(sizeof(uint32_t) - 1); atomic_clear_32((uint32_t *)addr, pagebits << shift); #endif /* PAGE_SIZE */ } } /* * vm_page_set_validclean: * * Sets portions of a page valid and clean. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zeroed. * * (base + size) must be less than or equal to PAGE_SIZE. */ void vm_page_set_validclean(vm_page_t m, int base, int size) { vm_page_bits_t oldvalid, pagebits; int endoff, frag; VM_OBJECT_ASSERT_WLOCKED(m->object); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = base & ~(DEV_BSIZE - 1)) != base && (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff && (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Set valid, clear dirty bits. If validating the entire * page we can safely clear the pmap modify bit. We also * use this opportunity to clear the VPO_NOSYNC flag. 
If a process * takes a write fault on a MAP_NOSYNC memory area the flag will * be set again. * * We set valid bits inclusive of any overlap, but we can only * clear dirty bits for DEV_BSIZE chunks that are fully within * the range. */ oldvalid = m->valid; pagebits = vm_page_bits(base, size); m->valid |= pagebits; #if 0 /* NOT YET */ if ((frag = base & (DEV_BSIZE - 1)) != 0) { frag = DEV_BSIZE - frag; base += frag; size -= frag; if (size < 0) size = 0; } pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); #endif if (base == 0 && size == PAGE_SIZE) { /* * The page can only be modified within the pmap if it is * mapped, and it can only be mapped if it was previously * fully valid. */ if (oldvalid == VM_PAGE_BITS_ALL) /* * Perform the pmap_clear_modify() first. Otherwise, * a concurrent pmap operation, such as * pmap_protect(), could clear a modification in the * pmap and set the dirty field on the page before * pmap_clear_modify() had begun and after the dirty * field was cleared here. */ pmap_clear_modify(m); m->dirty = 0; m->oflags &= ~VPO_NOSYNC; } else if (oldvalid != VM_PAGE_BITS_ALL) m->dirty &= ~pagebits; else vm_page_clear_dirty_mask(m, pagebits); } /* * vm_page_clear_dirty: * * Clears the dirty bits that vm_page_bits() maps the byte range * (base, size) to, by way of vm_page_clear_dirty_mask(). */ void vm_page_clear_dirty(vm_page_t m, int base, int size) { vm_page_clear_dirty_mask(m, vm_page_bits(base, size)); } /* * vm_page_set_invalid: * * Invalidates DEV_BSIZE'd chunks within a page. Both the * valid and dirty bits for the affected areas are cleared. 
*/ void vm_page_set_invalid(vm_page_t m, int base, int size) { vm_page_bits_t bits; vm_object_t object; object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); /* * For a vnode object, a range that starts at the beginning of the * page and reaches the vnode pager's recorded size (vnp_size) or * beyond invalidates the whole page. */ if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) + size >= object->un_pager.vnp.vnp_size) bits = VM_PAGE_BITS_ALL; else bits = vm_page_bits(base, size); if (object->ref_count != 0 && m->valid == VM_PAGE_BITS_ALL && bits != 0) pmap_remove_all(m); KASSERT((bits == 0 && m->valid == VM_PAGE_BITS_ALL) || !pmap_page_is_mapped(m), ("vm_page_set_invalid: page %p is mapped", m)); m->valid &= ~bits; m->dirty &= ~bits; } /* * vm_page_zero_invalid() * * The kernel assumes that the invalid portions of a page contain * garbage, but such pages can be mapped into memory by user code. * When this occurs, we must zero out the non-valid portions of the * page so user code sees what it expects. * * Pages are most often semi-valid when the end of a file is mapped * into memory and the file's size is not page aligned. */ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) { int b; int i; VM_OBJECT_ASSERT_WLOCKED(m->object); /* * Scan the valid bits looking for invalid sections that * must be zeroed. Invalid sub-DEV_BSIZE'd areas (where the * valid bit may be set) have already been zeroed by * vm_page_set_validclean(). */ for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { if (i == (PAGE_SIZE / DEV_BSIZE) || (m->valid & ((vm_page_bits_t)1 << i))) { if (i > b) { pmap_zero_page_area(m, b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); } b = i + 1; } } /* * setvalid is TRUE when we can safely set the zeroed areas * as being valid. We can do this if there are no cache consistency * issues, e.g., it is ok to do with UFS, but not ok to do with NFS. */ if (setvalid) m->valid = VM_PAGE_BITS_ALL; } /* * vm_page_is_valid: * * Is (partial) page valid? 
Note that the case where size == 0 * will return FALSE in the degenerate case where the page is * entirely invalid, and TRUE otherwise. 
*/ int vm_page_is_valid(vm_page_t m, int base, int size) { vm_page_bits_t bits; VM_OBJECT_ASSERT_LOCKED(m->object); bits = vm_page_bits(base, size); return (m->valid != 0 && (m->valid & bits) == bits); } /* * vm_page_ps_is_valid: * * Returns TRUE if the entire (super)page is valid and FALSE otherwise. */ boolean_t vm_page_ps_is_valid(vm_page_t m) { int i, npages; VM_OBJECT_ASSERT_LOCKED(m->object); npages = atop(pagesizes[m->psind]); /* * The physically contiguous pages that make up a superpage, i.e., a * page with a page size index ("psind") greater than zero, will * occupy adjacent entries in vm_page_array[]. */ for (i = 0; i < npages; i++) { if (m[i].valid != VM_PAGE_BITS_ALL) return (FALSE); } return (TRUE); } /* * Set the page's dirty bits if the page is modified. */ void vm_page_test_dirty(vm_page_t m) { VM_OBJECT_ASSERT_WLOCKED(m->object); if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) vm_page_dirty(m); } void vm_page_lock_KBI(vm_page_t m, const char *file, int line) { mtx_lock_flags_(vm_page_lockptr(m), 0, file, line); } void vm_page_unlock_KBI(vm_page_t m, const char *file, int line) { mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line); } int vm_page_trylock_KBI(vm_page_t m, const char *file, int line) { return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line)); } #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line) { vm_page_lock_assert_KBI(m, MA_OWNED, file, line); } void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line) { mtx_assert_(vm_page_lockptr(m), a, file, line); } #endif #ifdef INVARIANTS void vm_page_object_lock_assert(vm_page_t m) { /* * Certain of the page's fields may only be modified by the * holder of the containing object's lock or the exclusive busy. * holder. Unfortunately, the holder of the write busy is * not recorded, and thus cannot be checked here. 
*/
	if (m->object != NULL && !vm_page_xbusied(m))
		VM_OBJECT_ASSERT_WLOCKED(m->object);
}

/*
 * Sanity check for a caller about to set "bits" in the page's aflags.
 * Nothing is checked unless PGA_WRITEABLE is among the bits.
 */
void
vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits)
{

	if ((bits & PGA_WRITEABLE) == 0)
		return;

	/*
	 * The PGA_WRITEABLE flag can only be set if the page is
	 * managed, is exclusively busied or the object is locked.
	 * Currently, this flag is only set by pmap_enter().
	 */
	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("PGA_WRITEABLE on unmanaged page"));
	if (!vm_page_xbusied(m))
		VM_OBJECT_ASSERT_LOCKED(m->object);
}
#endif

#include "opt_ddb.h"
#ifdef DDB
#include

#include

/*
 * "show page": print the global page counters and thresholds kept
 * in vm_cnt.
 */
DB_SHOW_COMMAND(page, vm_page_print_page_info)
{

	db_printf("vm_cnt.v_free_count: %d\n", vm_cnt.v_free_count);
	db_printf("vm_cnt.v_cache_count: %d\n", vm_cnt.v_cache_count);
	db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count);
	db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count);
	db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count);
	db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved);
	db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min);
	db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target);
	db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target);
}

/*
 * "show pageq": print the global free/cache counts, then one line per
 * VM domain with its page, free, active-queue and inactive-queue counts
 * and its pagedaemon pass number.
 */
DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
{
	int dom;

	db_printf("pq_free %d pq_cache %d\n",
	    vm_cnt.v_free_count, vm_cnt.v_cache_count);
	for (dom = 0; dom < vm_ndomains; dom++) {
		db_printf(
	"dom %d page_cnt %d free %d pq_act %d pq_inact %d pass %d\n",
		    dom,
		    vm_dom[dom].vmd_page_count,
		    vm_dom[dom].vmd_free_count,
		    vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
		    vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
		    vm_dom[dom].vmd_pass);
	}
}

/*
 * "show pginfo addr": dump the fields of one vm_page.  With the 'p'
 * modifier the address is treated as a physical address and translated
 * with PHYS_TO_VM_PAGE(); otherwise it is used directly as a vm_page
 * pointer.
 * NOTE(review): have_addr, addr and modif are not declared here; they
 * are presumably supplied by the DB_SHOW_COMMAND() macro -- confirm.
 */
DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
{
	vm_page_t m;
	boolean_t phys;

	if (!have_addr) {
		db_printf("show pginfo addr\n");
		return;
	}

	phys = strchr(modif, 'p') != NULL;
	if (phys)
		m = PHYS_TO_VM_PAGE(addr);
	else
		m = (vm_page_t)addr;
	db_printf(
    "page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n"
    " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n",
	    m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr,
	    m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags,
	    m->flags, m->act_count, m->busy_lock, m->valid, m->dirty);
}
#endif /* DDB */
Index: head/sys/vm/vm_page.h
===================================================================
--- head/sys/vm/vm_page.h (revision 292468)
+++ head/sys/vm/vm_page.h (revision 292469)
@@ -1,697 +1,701 @@
/*-
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.h 8.2 (Berkeley) 12/13/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Resident memory system definitions. */ #ifndef _VM_PAGE_ #define _VM_PAGE_ #include /* * Management of resident (logical) pages. * * A small structure is kept for each resident * page, indexed by page number. 
Each structure * is an element of several collections: * * A radix tree used to quickly * perform object/offset lookups * * A list of all pages for a given object, * so they can be quickly deactivated at * time of deallocation. * * An ordered list of pages due for pageout. * * In addition, the structure contains the object * and offset to which this page belongs (for pageout), * and sundry status bits. * * In general, operations on this structure's mutable fields are * synchronized using either one of or a combination of the lock on the * object that the page belongs to (O), the pool lock for the page (P), * or the lock for either the free or paging queue (Q). If a field is * annotated below with two of these locks, then holding either lock is * sufficient for read access, but both locks are required for write * access. * * In contrast, the synchronization of accesses to the page's * dirty field is machine dependent (M). In the * machine-independent layer, the lock on the object that the * page belongs to must be held in order to operate on the field. * However, the pmap layer is permitted to set all bits within * the field without holding that lock. If the underlying * architecture does not support atomic read-modify-write * operations on the field's type, then the machine-independent * layer uses a 32-bit atomic on the aligned 32-bit word that * contains the dirty field. In the machine-independent layer, * the implementation of read-modify-write operations on the * field is encapsulated in vm_page_clear_dirty_mask(). 
*/

/*
 * vm_page_bits_t is the bitmap type used for the valid and dirty fields
 * of struct vm_page, and VM_PAGE_BITS_ALL is that bitmap with every bit
 * set.  The type doubles in width each time PAGE_SIZE doubles, from
 * 8 bits at 4096 bytes up to 64 bits at 32768 bytes.  No definition is
 * provided for any other PAGE_SIZE.
 */
#if PAGE_SIZE == 4096
#define VM_PAGE_BITS_ALL 0xffu
typedef uint8_t vm_page_bits_t;
#elif PAGE_SIZE == 8192
#define VM_PAGE_BITS_ALL 0xffffu
typedef uint16_t vm_page_bits_t;
#elif PAGE_SIZE == 16384
#define VM_PAGE_BITS_ALL 0xffffffffu
typedef uint32_t vm_page_bits_t;
#elif PAGE_SIZE == 32768
#define VM_PAGE_BITS_ALL 0xfffffffffffffffflu
typedef uint64_t vm_page_bits_t;
#endif

/*
 * Per-page bookkeeping structure.  The letters in the field comments
 * name the lock that synchronizes the field: (O) object lock, (P) page
 * lock, (Q) free or paging queue lock, (M) machine dependent.
 *
 * The field order matters: this header asserts at compile time that
 * aflags starts on a 32-bit boundary, so do not reorder or insert
 * fields without re-checking that assertion.
 */
struct vm_page {
	union {
		TAILQ_ENTRY(vm_page) q; /* page queue or free list (Q) */
		struct {
			SLIST_ENTRY(vm_page) ss; /* private slists */
			void *pv;
		} s;
		struct {
			u_long p;
			u_long v;
		} memguard;
	} plinks;
	TAILQ_ENTRY(vm_page) listq;	/* pages in same object (O) */
	vm_object_t object;		/* which object am I in (O,P) */
	vm_pindex_t pindex;		/* offset into object (O,P) */
	vm_paddr_t phys_addr;		/* physical address of page */
	struct md_page md;		/* machine dependent stuff */
	u_int wire_count;		/* wired down maps refs (P) */
	volatile u_int busy_lock;	/* busy owners lock */
	uint16_t hold_count;		/* page hold count (P) */
	uint16_t flags;			/* page PG_* flags (P) */
	uint8_t aflags;			/* access is atomic */
	uint8_t oflags;			/* page VPO_* flags (O) */
	uint8_t queue;			/* page queue index (P,Q) */
	int8_t psind;			/* pagesizes[] index (O) */
	int8_t segind;
	uint8_t order;			/* index of the buddy queue */
	uint8_t pool;
	u_char act_count;		/* page usage count (P) */
	/* NOTE that these must support one bit per DEV_BSIZE in a page */
	/* so, on normal X86 kernels, they must be at least 8 bits wide */
	vm_page_bits_t valid;		/* map of valid DEV_BSIZE chunks (O) */
	vm_page_bits_t dirty;		/* map of dirty DEV_BSIZE chunks (M) */
};

/*
 * Page flags stored in oflags:
 *
 * Access to these page flags is synchronized by the lock on the object
 * containing the page (O).
 *
 * Note: VPO_UNMANAGED (used by OBJT_DEVICE, OBJT_PHYS and OBJT_SG)
 * 	 indicates that the page is not under PV management but
 * 	 otherwise should be treated as a normal page.
Pages not * under PV management cannot be paged out via the * object/vm_page_t because there is no knowledge of their pte * mappings, and such pages are also not on any PQ queue. * */ #define VPO_UNUSED01 0x01 /* --available-- */ #define VPO_SWAPSLEEP 0x02 /* waiting for swap to finish */ #define VPO_UNMANAGED 0x04 /* no PV management for page */ #define VPO_SWAPINPROG 0x08 /* swap I/O in progress on page */ #define VPO_NOSYNC 0x10 /* do not collect for syncer */ /* * Busy page implementation details. * The algorithm is taken mostly by rwlock(9) and sx(9) locks implementation, * even if the support for owner identity is removed because of size * constraints. Checks on lock recursion are then not possible, while the * lock assertions effectiveness is someway reduced. */ #define VPB_BIT_SHARED 0x01 #define VPB_BIT_EXCLUSIVE 0x02 #define VPB_BIT_WAITERS 0x04 #define VPB_BIT_FLAGMASK \ (VPB_BIT_SHARED | VPB_BIT_EXCLUSIVE | VPB_BIT_WAITERS) #define VPB_SHARERS_SHIFT 3 #define VPB_SHARERS(x) \ (((x) & ~VPB_BIT_FLAGMASK) >> VPB_SHARERS_SHIFT) #define VPB_SHARERS_WORD(x) ((x) << VPB_SHARERS_SHIFT | VPB_BIT_SHARED) #define VPB_ONE_SHARER (1 << VPB_SHARERS_SHIFT) #define VPB_SINGLE_EXCLUSIVER VPB_BIT_EXCLUSIVE #define VPB_UNBUSIED VPB_SHARERS_WORD(0) #define PQ_NONE 255 #define PQ_INACTIVE 0 #define PQ_ACTIVE 1 #define PQ_COUNT 2 TAILQ_HEAD(pglist, vm_page); SLIST_HEAD(spglist, vm_page); struct vm_pagequeue { struct mtx pq_mutex; struct pglist pq_pl; int pq_cnt; int * const pq_vcnt; const char * const pq_name; } __aligned(CACHE_LINE_SIZE); struct vm_domain { struct vm_pagequeue vmd_pagequeues[PQ_COUNT]; u_int vmd_page_count; u_int vmd_free_count; long vmd_segs; /* bitmask of the segments */ boolean_t vmd_oom; int vmd_pass; /* local pagedaemon pass */ int vmd_oom_seq; int vmd_last_active_scan; struct vm_page vmd_marker; /* marker for pagedaemon private use */ struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */ }; extern struct vm_domain vm_dom[MAXMEMDOM]; 
#define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED) #define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex) #define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex) #ifdef _KERNEL static __inline void vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend) { #ifdef notyet vm_pagequeue_assert_locked(pq); #endif pq->pq_cnt += addend; atomic_add_int(pq->pq_vcnt, addend); } #define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1) #define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1) #endif /* _KERNEL */ extern struct mtx_padalign vm_page_queue_free_mtx; extern struct mtx_padalign pa_lock[]; #if defined(__arm__) #define PDRSHIFT PDR_SHIFT #elif !defined(PDRSHIFT) #define PDRSHIFT 21 #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define PA_LOCKPTR(pa) ((struct mtx *)(&pa_lock[pa_index(pa) % PA_LOCK_COUNT])) #define PA_LOCKOBJPTR(pa) ((struct lock_object *)PA_LOCKPTR((pa))) #define PA_LOCK(pa) mtx_lock(PA_LOCKPTR(pa)) #define PA_TRYLOCK(pa) mtx_trylock(PA_LOCKPTR(pa)) #define PA_UNLOCK(pa) mtx_unlock(PA_LOCKPTR(pa)) #define PA_UNLOCK_COND(pa) \ do { \ if ((pa) != 0) { \ PA_UNLOCK((pa)); \ (pa) = 0; \ } \ } while (0) #define PA_LOCK_ASSERT(pa, a) mtx_assert(PA_LOCKPTR(pa), (a)) #ifdef KLD_MODULE #define vm_page_lock(m) vm_page_lock_KBI((m), LOCK_FILE, LOCK_LINE) #define vm_page_unlock(m) vm_page_unlock_KBI((m), LOCK_FILE, LOCK_LINE) #define vm_page_trylock(m) vm_page_trylock_KBI((m), LOCK_FILE, LOCK_LINE) #else /* !KLD_MODULE */ #define vm_page_lockptr(m) (PA_LOCKPTR(VM_PAGE_TO_PHYS((m)))) #define vm_page_lock(m) mtx_lock(vm_page_lockptr((m))) #define vm_page_unlock(m) mtx_unlock(vm_page_lockptr((m))) #define vm_page_trylock(m) mtx_trylock(vm_page_lockptr((m))) #endif #if defined(INVARIANTS) #define vm_page_assert_locked(m) \ vm_page_assert_locked_KBI((m), __FILE__, __LINE__) #define vm_page_lock_assert(m, a) \ vm_page_lock_assert_KBI((m), (a), __FILE__, __LINE__) #else #define vm_page_assert_locked(m) #define 
vm_page_lock_assert(m, a) #endif /* * The vm_page's aflags are updated using atomic operations. To set or clear * these flags, the functions vm_page_aflag_set() and vm_page_aflag_clear() * must be used. Neither these flags nor these functions are part of the KBI. * * PGA_REFERENCED may be cleared only if the page is locked. It is set by * both the MI and MD VM layers. However, kernel loadable modules should not * directly set this flag. They should call vm_page_reference() instead. * * PGA_WRITEABLE is set exclusively on managed pages by pmap_enter(). * When it does so, the object must be locked, or the page must be * exclusive busied. The MI VM layer must never access this flag * directly. Instead, it should call pmap_page_is_write_mapped(). * * PGA_EXECUTABLE may be set by pmap routines, and indicates that a page has * at least one executable mapping. It is not consumed by the MI VM layer. */ #define PGA_WRITEABLE 0x01 /* page may be mapped writeable */ #define PGA_REFERENCED 0x02 /* page has been referenced */ #define PGA_EXECUTABLE 0x04 /* page may be mapped executable */ /* * Page flags. If changed at any other time than page allocation or * freeing, the modification must be protected by the vm_page lock. */ #define PG_CACHED 0x0001 /* page is cached */ #define PG_FICTITIOUS 0x0004 /* physical page doesn't exist */ #define PG_ZERO 0x0008 /* page is zeroed */ #define PG_MARKER 0x0010 /* special queue marker page */ #define PG_WINATCFLS 0x0040 /* flush dirty page on inactive q */ #define PG_NODUMP 0x0080 /* don't include this page in a dump */ #define PG_UNHOLDFREE 0x0100 /* delayed free of a held page */ /* * Misc constants. */ #define ACT_DECLINE 1 #define ACT_ADVANCE 3 #define ACT_INIT 5 #define ACT_MAX 64 #ifdef _KERNEL #include #include /* * Each pageable resident page falls into one of four lists: * * free * Available for allocation now. * * cache * Almost available for allocation. Still associated with * an object, but clean and immediately freeable. 
* * The following lists are LRU sorted: * * inactive * Low activity, candidates for reclamation. * This is the list of pages that should be * paged out next. * * active * Pages that are "active" i.e. they have been * recently referenced. * */ extern int vm_page_zero_count; extern vm_page_t vm_page_array; /* First resident page in table */ extern long vm_page_array_size; /* number of vm_page_t's */ extern long first_page; /* first physical page number */ #define VM_PAGE_TO_PHYS(entry) ((entry)->phys_addr) /* * PHYS_TO_VM_PAGE() returns the vm_page_t object that represents a memory * page to which the given physical address belongs. The correct vm_page_t * object is returned for addresses that are not page-aligned. */ vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa); /* * Page allocation parameters for vm_page for the functions * vm_page_alloc(), vm_page_grab(), vm_page_alloc_contig() and * vm_page_alloc_freelist(). Some functions support only a subset * of the flags, and ignore others, see the flags legend. * * Bits 0 - 1 define class. * Bits 2 - 15 dedicated for flags. * Legend: * (a) - vm_page_alloc() supports the flag. * (c) - vm_page_alloc_contig() supports the flag. * (f) - vm_page_alloc_freelist() supports the flag. * (g) - vm_page_grab() supports the flag. * Bits above 15 define the count of additional pages that the caller * intends to allocate. 
*/ #define VM_ALLOC_NORMAL 0 #define VM_ALLOC_INTERRUPT 1 #define VM_ALLOC_SYSTEM 2 #define VM_ALLOC_CLASS_MASK 3 #define VM_ALLOC_WIRED 0x0020 /* (acfg) Allocate non pageable page */ #define VM_ALLOC_ZERO 0x0040 /* (acfg) Try to obtain a zeroed page */ #define VM_ALLOC_NOOBJ 0x0100 /* (acg) No associated object */ #define VM_ALLOC_NOBUSY 0x0200 /* (acg) Do not busy the page */ #define VM_ALLOC_IFCACHED 0x0400 /* (ag) Fail if page is not cached */ #define VM_ALLOC_IFNOTCACHED 0x0800 /* (ag) Fail if page is cached */ #define VM_ALLOC_IGN_SBUSY 0x1000 /* (g) Ignore shared busy flag */ #define VM_ALLOC_NODUMP 0x2000 /* (ag) don't include in dump */ #define VM_ALLOC_SBUSY 0x4000 /* (acg) Shared busy the page */ #define VM_ALLOC_NOWAIT 0x8000 /* (g) Do not sleep, return NULL */ #define VM_ALLOC_COUNT_SHIFT 16 #define VM_ALLOC_COUNT(count) ((count) << VM_ALLOC_COUNT_SHIFT) #ifdef M_NOWAIT static inline int malloc2vm_flags(int malloc_flags) { int pflags; KASSERT((malloc_flags & M_USE_RESERVE) == 0 || (malloc_flags & M_NOWAIT) != 0, ("M_USE_RESERVE requires M_NOWAIT")); pflags = (malloc_flags & M_USE_RESERVE) != 0 ? 
VM_ALLOC_INTERRUPT : VM_ALLOC_SYSTEM; if ((malloc_flags & M_ZERO) != 0) pflags |= VM_ALLOC_ZERO; if ((malloc_flags & M_NODUMP) != 0) pflags |= VM_ALLOC_NODUMP; return (pflags); } #endif void vm_page_busy_downgrade(vm_page_t m); void vm_page_busy_sleep(vm_page_t m, const char *msg); void vm_page_flash(vm_page_t m); void vm_page_hold(vm_page_t mem); void vm_page_unhold(vm_page_t mem); void vm_page_free(vm_page_t m); void vm_page_free_zero(vm_page_t m); void vm_page_activate (vm_page_t); void vm_page_advise(vm_page_t m, int advice); vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int); vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr); vm_page_t vm_page_alloc_freelist(int, int); vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int); void vm_page_cache(vm_page_t); void vm_page_cache_free(vm_object_t, vm_pindex_t, vm_pindex_t); void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t); int vm_page_try_to_cache (vm_page_t); int vm_page_try_to_free (vm_page_t); void vm_page_deactivate (vm_page_t); void vm_page_deactivate_noreuse(vm_page_t); void vm_page_dequeue(vm_page_t m); void vm_page_dequeue_locked(vm_page_t m); vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t); vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t); boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex); vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t); vm_page_t vm_page_next(vm_page_t m); int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *); struct vm_pagequeue *vm_page_pagequeue(vm_page_t m); vm_page_t vm_page_prev(vm_page_t m); boolean_t vm_page_ps_is_valid(vm_page_t m); void vm_page_putfake(vm_page_t m); void vm_page_readahead_finish(vm_page_t m); +bool 
vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, + vm_paddr_t high, u_long alignment, vm_paddr_t boundary); void vm_page_reference(vm_page_t m); void vm_page_remove (vm_page_t); int vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t); vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex); void vm_page_requeue(vm_page_t m); void vm_page_requeue_locked(vm_page_t m); int vm_page_sbusied(vm_page_t m); +vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start, + vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options); void vm_page_set_valid_range(vm_page_t m, int base, int size); int vm_page_sleep_if_busy(vm_page_t m, const char *msg); vm_offset_t vm_page_startup(vm_offset_t vaddr); void vm_page_sunbusy(vm_page_t m); int vm_page_trysbusy(vm_page_t m); void vm_page_unhold_pages(vm_page_t *ma, int count); boolean_t vm_page_unwire(vm_page_t m, uint8_t queue); void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_wire (vm_page_t); void vm_page_xunbusy_hard(vm_page_t m); void vm_page_set_validclean (vm_page_t, int, int); void vm_page_clear_dirty (vm_page_t, int, int); void vm_page_set_invalid (vm_page_t, int, int); int vm_page_is_valid (vm_page_t, int, int); void vm_page_test_dirty (vm_page_t); vm_page_bits_t vm_page_bits(int base, int size); void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid); void vm_page_free_toq(vm_page_t m); void vm_page_zero_idle_wakeup(void); void vm_page_dirty_KBI(vm_page_t m); void vm_page_lock_KBI(vm_page_t m, const char *file, int line); void vm_page_unlock_KBI(vm_page_t m, const char *file, int line); int vm_page_trylock_KBI(vm_page_t m, const char *file, int line); #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line); void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line); #endif #define vm_page_assert_sbusied(m) \ KASSERT(vm_page_sbusied(m), \ 
("vm_page_assert_sbusied: page %p not shared busy @ %s:%d", \ (m), __FILE__, __LINE__)) #define vm_page_assert_unbusied(m) \ KASSERT(!vm_page_busied(m), \ ("vm_page_assert_unbusied: page %p busy @ %s:%d", \ (m), __FILE__, __LINE__)) #define vm_page_assert_xbusied(m) \ KASSERT(vm_page_xbusied(m), \ ("vm_page_assert_xbusied: page %p not exclusive busy @ %s:%d", \ (m), __FILE__, __LINE__)) #define vm_page_busied(m) \ ((m)->busy_lock != VPB_UNBUSIED) #define vm_page_sbusy(m) do { \ if (!vm_page_trysbusy(m)) \ panic("%s: page %p failed shared busying", __func__, \ (m)); \ } while (0) #define vm_page_tryxbusy(m) \ (atomic_cmpset_acq_int(&(m)->busy_lock, VPB_UNBUSIED, \ VPB_SINGLE_EXCLUSIVER)) #define vm_page_xbusied(m) \ (((m)->busy_lock & VPB_SINGLE_EXCLUSIVER) != 0) #define vm_page_xbusy(m) do { \ if (!vm_page_tryxbusy(m)) \ panic("%s: page %p failed exclusive busying", __func__, \ (m)); \ } while (0) #define vm_page_xunbusy(m) do { \ if (!atomic_cmpset_rel_int(&(m)->busy_lock, \ VPB_SINGLE_EXCLUSIVER, VPB_UNBUSIED)) \ vm_page_xunbusy_hard(m); \ } while (0) #ifdef INVARIANTS void vm_page_object_lock_assert(vm_page_t m); #define VM_PAGE_OBJECT_LOCK_ASSERT(m) vm_page_object_lock_assert(m) void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits); #define VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits) \ vm_page_assert_pga_writeable(m, bits) #else #define VM_PAGE_OBJECT_LOCK_ASSERT(m) (void)0 #define VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits) (void)0 #endif /* * We want to use atomic updates for the aflags field, which is 8 bits wide. * However, not all architectures support atomic operations on 8-bit * destinations. In order that we can easily use a 32-bit operation, we * require that the aflags field be 32-bit aligned. */ CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0); /* * Clear the given bits in the specified page. 
*/
static inline void
vm_page_aflag_clear(vm_page_t m, uint8_t bits)
{
	uint32_t *addr, val;

	/*
	 * The PGA_REFERENCED flag can only be cleared if the page is locked.
	 */
	if ((bits & PGA_REFERENCED) != 0)
		vm_page_assert_locked(m);

	/*
	 * Access the whole 32-bit word containing the aflags field with an
	 * atomic update.  Parallel non-atomic updates to the other fields
	 * within this word are handled properly by the atomic update.
	 */
	addr = (void *)&m->aflags;
	KASSERT(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0,
	    ("vm_page_aflag_clear: aflags is misaligned"));
	val = bits;
	/*
	 * aflags is the first byte of the aligned word (checked above); on
	 * a big-endian machine that byte is the most significant one, so
	 * move the mask up to bits 31..24.
	 */
#if BYTE_ORDER == BIG_ENDIAN
	val <<= 24;
#endif
	atomic_clear_32(addr, val);
}

/*
 *	Set the given bits in the specified page.
 */
static inline void
vm_page_aflag_set(vm_page_t m, uint8_t bits)
{
	uint32_t *addr, val;

	VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits);

	/*
	 * Access the whole 32-bit word containing the aflags field with an
	 * atomic update.  Parallel non-atomic updates to the other fields
	 * within this word are handled properly by the atomic update.
	 */
	addr = (void *)&m->aflags;
	KASSERT(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0,
	    ("vm_page_aflag_set: aflags is misaligned"));
	val = bits;
	/* Same byte placement as in vm_page_aflag_clear(). */
#if BYTE_ORDER == BIG_ENDIAN
	val <<= 24;
#endif
	atomic_set_32(addr, val);
}

/*
 *	vm_page_dirty:
 *
 *	Set all bits in the page's dirty field.
 *
 *	The object containing the specified page must be locked if the
 *	call is made from the machine-independent layer.
 *
 *	See vm_page_clear_dirty_mask().
 */
static __inline void
vm_page_dirty(vm_page_t m)
{

	/* Use vm_page_dirty_KBI() under INVARIANTS to save memory. */
#if defined(KLD_MODULE) || defined(INVARIANTS)
	vm_page_dirty_KBI(m);
#else
	m->dirty = VM_PAGE_BITS_ALL;
#endif
}

/*
 *	vm_page_remque:
 *
 *	If the given page is in a page queue, then remove it from that page
 *	queue.
 *
 *	The page must be locked.
 */
static inline void
vm_page_remque(vm_page_t m)
{

	/* PQ_NONE marks a page that is not on any queue. */
	if (m->queue != PQ_NONE)
		vm_page_dequeue(m);
}

/*
 *	vm_page_undirty:
 *
 *	Set page to not be dirty.
Note: does not clear pmap modify bits
 */
static __inline void
vm_page_undirty(vm_page_t m)
{

	VM_PAGE_OBJECT_LOCK_ASSERT(m);
	m->dirty = 0;
}

/*
 *	vm_page_replace_checked:
 *
 *	Calls vm_page_replace(mnew, object, pindex) and asserts that the
 *	page it returns is "mold".  Without INVARIANTS the check compiles
 *	away and this is just the vm_page_replace() call.
 */
static inline void
vm_page_replace_checked(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
    vm_page_t mold)
{
	vm_page_t mret;

	mret = vm_page_replace(mnew, object, pindex);
	KASSERT(mret == mold,
	    ("invalid page replacement, mold=%p, mret=%p", mold, mret));

	/* Unused if !INVARIANTS. */
	(void)mold;
	(void)mret;
}

#endif				/* _KERNEL */
#endif				/* !_VM_PAGE_ */
Index: head/sys/vm/vm_pageout.c
===================================================================
--- head/sys/vm/vm_pageout.c (revision 292468)
+++ head/sys/vm/vm_pageout.c (revision 292469)
@@ -1,2016 +1,1850 @@
/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2005 Yahoo! Technologies Norway AS
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4.
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
* * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * The proverbial page-out daemon. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static void vm_pageout_init(void); static int vm_pageout_clean(vm_page_t m); static int vm_pageout_cluster(vm_page_t m); static void vm_pageout_scan(struct vm_domain *vmd, int pass); static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage); SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init, NULL); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &page_kp); SDT_PROVIDER_DEFINE(vm); SDT_PROBE_DEFINE(vm, , , vm__lowmem_cache); SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan); #if !defined(NO_SWAPPING) /* the kernel process "vm_daemon"*/ static void vm_daemon(void); static struct proc *vmproc; static struct kproc_desc vm_kp = { "vmdaemon", vm_daemon, &vmproc }; SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); #endif int vm_pages_needed; /* Event on which pageout daemon sleeps */ int vm_pageout_deficit; /* Estimated number of pages deficit */ int vm_pageout_wakeup_thresh; static int vm_pageout_oom_seq = 12; #if !defined(NO_SWAPPING) static int 
vm_pageout_req_swapout;	/* XXX */
static int vm_daemon_needed;
static struct mtx vm_daemon_mtx;
/* Allow for use by vm_pageout before vm_daemon is initialized. */
MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
#endif
static int vm_max_launder = 32;
static int vm_pageout_update_period;
static int defer_swap_pageouts;
static int disable_swap_pageouts;
static int lowmem_period = 10;
static time_t lowmem_uptime;

/* Swap tunables default to off when the kernel is built with NO_SWAPPING. */
#if defined(NO_SWAPPING)
static int vm_swap_enabled = 0;
static int vm_swap_idle_enabled = 0;
#else
static int vm_swap_enabled = 1;
static int vm_swap_idle_enabled = 0;
#endif

static int vm_panic_on_oom = 0;

/* The knobs below are all published under the "vm" sysctl node. */
SYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
	CTLFLAG_RWTUN, &vm_panic_on_oom, 0,
	"panic on out of memory instead of killing the largest process");

SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh,
	CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0,
	"free page threshold for waking up the pageout daemon");

SYSCTL_INT(_vm, OID_AUTO, max_launder,
	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");

SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
	CTLFLAG_RW, &vm_pageout_update_period, 0,
	"Maximum active LRU update period");

SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0,
	"Low memory callback period");

/* With NO_SWAPPING the two swap knobs are exported read-only (CTLFLAG_RD). */
#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#endif

SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
	CTLFLAG_RW, &defer_swap_pageouts, 0,
	"Give preference to dirty pages in mem");

SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
	CTLFLAG_RW, &disable_swap_pageouts, 0,
	"Disallow swapout of dirty pages");

static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");

SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
	CTLFLAG_RW, &vm_pageout_oom_seq, 0,
	"back-to-back calls to oom detector to start OOM");

#define VM_PAGEOUT_PAGE_COUNT 16
int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;

int vm_page_max_wired;		/* XXX max # of wired pages system-wide */
SYSCTL_INT(_vm, OID_AUTO, max_wired,
	CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");

static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
-static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t,
-    vm_paddr_t);
#if !defined(NO_SWAPPING)
static void vm_pageout_map_deactivate_pages(vm_map_t, long);
static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
static void vm_req_vmdaemon(int req);
#endif
static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);

/*
 * Initialize a dummy page for marking the caller's place in the specified
 * paging queue.  In principle, this function only needs to set the flag
 * PG_MARKER.  Nonetheless, it write-busies the marker and initializes the
 * hold count to one as safety precautions.
 *
 *	marker	- page structure to turn into a marker; it is zeroed first,
 *		  so any previous contents are lost
 *	queue	- queue index recorded in the marker's "queue" field
 */
static void
vm_pageout_init_marker(vm_page_t marker, u_short queue)
{

	bzero(marker, sizeof(*marker));
	marker->flags = PG_MARKER;
	/* The write-busy and hold below are the "safety precautions". */
	marker->busy_lock = VPB_SINGLE_EXCLUSIVER;
	marker->queue = queue;
	marker->hold_count = 1;
}

/*
 * vm_pageout_fallback_object_lock:
 * 
 *	Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is
 *	known to have failed and page queue must be either PQ_ACTIVE or
 *	PQ_INACTIVE.  To avoid lock order violation, unlock the page queues
 *	while locking the vm object.  Use marker page to detect page queue
 *	changes and maintain notion of next page on page queue.  Return
 *	TRUE if no changes were detected, FALSE otherwise.  vm object is
 *	locked on return.
* 
 *	This function depends on both the lock portion of struct vm_object
 *	and normal struct vm_page being type stable.
 */
static boolean_t
vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
{
	struct vm_page marker;
	struct vm_pagequeue *pq;
	boolean_t unchanged;
	u_short queue;
	vm_object_t object;

	/*
	 * Record the page's queue index and object before any lock is
	 * dropped; both are compared against the page again further down.
	 */
	queue = m->queue;
	vm_pageout_init_marker(&marker, queue);
	pq = vm_page_pagequeue(m);
	object = m->object;

	/* Place the on-stack marker directly after m in the page queue. */
	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);

	/*
	 * Drop the page queue lock and the page lock, block on the object
	 * write lock, and only then retake the page and page queue locks.
	 */
	vm_pagequeue_unlock(pq);
	vm_page_unlock(m);
	VM_OBJECT_WLOCK(object);
	vm_page_lock(m);
	vm_pagequeue_lock(pq);

	/*
	 * The page's object might have changed, and/or the page might
	 * have moved from its original position in the queue.  If the
	 * page's object has changed, then the caller should abandon
	 * processing the page because the wrong object lock was
	 * acquired.  Use the marker's plinks.q, not the page's, to
	 * determine if the page has been moved.  The state of the
	 * page's plinks.q can be indeterminate; whereas, the marker's
	 * plinks.q must be valid.
	 */
	*next = TAILQ_NEXT(&marker, plinks.q);
	unchanged = m->object == object &&
	    m == TAILQ_PREV(&marker, pglist, plinks.q);
	KASSERT(!unchanged || m->queue == queue,
	    ("page %p queue %d %d", m, queue, m->queue));
	/* The marker lives on this stack frame; unlink it before returning. */
	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
	return (unchanged);
}

/*
 * Lock the page while holding the page queue lock.  Use marker page
 * to detect page queue changes and maintain notion of next page on
 * page queue.  Return TRUE if no changes were detected, FALSE
 * otherwise.  The page is locked on return. The page queue lock might
 * be dropped and reacquired.
 *
 * This function depends on normal struct vm_page being type stable.
*/
static boolean_t
vm_pageout_page_lock(vm_page_t m, vm_page_t *next)
{
	struct vm_page marker;
	struct vm_pagequeue *pq;
	boolean_t unchanged;
	u_short queue;

	vm_page_lock_assert(m, MA_NOTOWNED);
	/*
	 * Fast path: the page lock was free.  The page queue lock is never
	 * dropped here and *next is left untouched.
	 */
	if (vm_page_trylock(m))
		return (TRUE);

	/*
	 * Slow path: leave a marker right behind m, drop the page queue
	 * lock, block on the page lock, then retake the page queue lock.
	 */
	queue = m->queue;
	vm_pageout_init_marker(&marker, queue);
	pq = vm_page_pagequeue(m);

	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
	vm_pagequeue_unlock(pq);
	vm_page_lock(m);
	vm_pagequeue_lock(pq);

	/* Page queue might have changed. */
	*next = TAILQ_NEXT(&marker, plinks.q);
	unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q);
	KASSERT(!unchanged || m->queue == queue,
	    ("page %p queue %d %d", m, queue, m->queue));
	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
	return (unchanged);
}

/*
 * vm_pageout_cluster:
 *
 *	Starting from the page "m", collect the neighbouring pages of the
 *	same object (lower indices first, then higher ones) that qualify
 *	for pageout, up to vm_pageout_page_count pages in total, and hand
 *	the resulting run to vm_pageout_flush().
 *
 *	On entry the page lock on "m" must be owned and the object must be
 *	write locked; both are asserted.  The page lock is released here,
 *	while the object lock is not released by this function.
 *
 *	The pages are not busied in this function; vm_pageout_flush() does
 *	that.  The value returned is whatever vm_pageout_flush() returns.
 */
static int
vm_pageout_cluster(vm_page_t m)
{
	vm_object_t object;
	vm_page_t mc[2*vm_pageout_page_count], pb, ps;
	int pageout_count;
	int ib, is, page_base;
	vm_pindex_t pindex = m->pindex;

	vm_page_lock_assert(m, MA_OWNED);
	object = m->object;
	VM_OBJECT_ASSERT_WLOCKED(object);

	/*
	 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
	 * with the new swapper, but we could have serious problems paging
	 * out other object types if there is insufficient memory.  
	 *
	 * Unfortunately, checking free memory here is far too late, so the
	 * check has been moved up a procedural level.
	 */

	/*
	 * Can't clean the page if it's busy or held.
	 */
	vm_page_assert_unbusied(m);
	KASSERT(m->hold_count == 0, ("vm_pageout_clean: page %p is held", m));
	vm_page_unlock(m);

	/*
	 * "m" goes into the middle slot of mc[], which has twice
	 * vm_pageout_page_count entries, so the run can grow downwards
	 * (--page_base) as well as upwards (page_base + pageout_count).
	 * pb and ps track the lowest and highest page collected so far.
	 */
	mc[vm_pageout_page_count] = pb = ps = m;
	pageout_count = 1;
	page_base = vm_pageout_page_count;
	ib = 1;
	is = 1;

	/*
	 * Scan object for clusterable pages.
	 *
	 * A neighbouring page is added to the cluster only if it exists,
	 * is not busied, is dirty (after vm_page_test_dirty()), sits on
	 * the inactive queue and is not held.  The first neighbour that
	 * fails any of these tests ends the scan in that direction.
	 *
	 * During heavy mmap/modification loads the pageout
	 * daemon can really fragment the underlying file
	 * due to flushing pages out of order and not trying to
	 * align the clusters (which leaves sporadic out-of-order
	 * holes).  To solve this problem we do the reverse scan
	 * first and attempt to align our cluster, then do a 
	 * forward scan if room remains.
	 */
more:
	while (ib && pageout_count < vm_pageout_page_count) {
		vm_page_t p;

		/* No page index lies ib pages below pindex. */
		if (ib > pindex) {
			ib = 0;
			break;
		}

		if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) {
			ib = 0;
			break;
		}
		vm_page_test_dirty(p);
		if (p->dirty == 0) {
			ib = 0;
			break;
		}
		vm_page_lock(p);
		if (p->queue != PQ_INACTIVE ||
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_unlock(p);
			ib = 0;
			break;
		}
		vm_page_unlock(p);
		mc[--page_base] = pb = p;
		++pageout_count;
		++ib;
		/*
		 * Alignment boundary, stop here and switch directions.  Do
		 * not clear ib.
		 */
		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
			break;
	}

	while (pageout_count < vm_pageout_page_count && 
	    pindex + is < object->size) {
		vm_page_t p;

		if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
			break;
		vm_page_test_dirty(p);
		if (p->dirty == 0)
			break;
		vm_page_lock(p);
		if (p->queue != PQ_INACTIVE ||
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_unlock(p);
			break;
		}
		vm_page_unlock(p);
		mc[page_base + pageout_count] = ps = p;
		++pageout_count;
		++is;
	}

	/*
	 * If we exhausted our forward scan, continue with the reverse scan
	 * when possible, even past an alignment boundary.  This catches
	 * boundary conditions.
	 */
	if (ib && pageout_count < vm_pageout_page_count)
		goto more;

	/*
	 * we allow reads during pageouts...
	 */
	return (vm_pageout_flush(&mc[page_base], pageout_count, 0, 0, NULL,
	    NULL));
}

/*
 * vm_pageout_flush() - launder the given pages
 *
 *	The given pages are laundered.
Note that we setup for the start of * I/O ( i.e. busy the page ), mark it read-only, and bump the object * reference count all in here rather then in the parent. If we want * the parent to do more sophisticated things we may have to change * the ordering. * * Returned runlen is the count of pages between mreq and first * page after mreq with status VM_PAGER_AGAIN. * *eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL * for any page in runlen set. */ int vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen, boolean_t *eio) { vm_object_t object = mc[0]->object; int pageout_status[count]; int numpagedout = 0; int i, runlen; VM_OBJECT_ASSERT_WLOCKED(object); /* * Initiate I/O. Bump the vm_page_t->busy counter and * mark the pages read-only. * * We do not have to fixup the clean/dirty bits here... we can * allow the pager to do it after the I/O completes. * * NOTE! mc[i]->dirty may be partial or fragmented due to an * edge case with file fragments. */ for (i = 0; i < count; i++) { KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush: partially invalid page %p index %d/%d", mc[i], i, count)); vm_page_sbusy(mc[i]); pmap_remove_write(mc[i]); } vm_object_pip_add(object, count); vm_pager_put_pages(object, mc, count, flags, pageout_status); runlen = count - mreq; if (eio != NULL) *eio = FALSE; for (i = 0; i < count; i++) { vm_page_t mt = mc[i]; KASSERT(pageout_status[i] == VM_PAGER_PEND || !pmap_page_is_write_mapped(mt), ("vm_pageout_flush: page %p is not write protected", mt)); switch (pageout_status[i]) { case VM_PAGER_OK: case VM_PAGER_PEND: numpagedout++; break; case VM_PAGER_BAD: /* * Page outside of range of object. Right now we * essentially lose the changes by pretending it * worked. */ vm_page_undirty(mt); break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* * If page couldn't be paged out, then reactivate the * page so it doesn't clog the inactive list. (We * will try paging out it again later). 
*/ vm_page_lock(mt); vm_page_activate(mt); vm_page_unlock(mt); if (eio != NULL && i >= mreq && i - mreq < runlen) *eio = TRUE; break; case VM_PAGER_AGAIN: if (i >= mreq && i - mreq < runlen) runlen = i - mreq; break; } /* * If the operation is still going, leave the page busy to * block all other accesses. Also, leave the paging in * progress indicator set so that we don't attempt an object * collapse. */ if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_sunbusy(mt); } } if (prunlen != NULL) *prunlen = runlen; return (numpagedout); -} - -static boolean_t -vm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low, - vm_paddr_t high) -{ - struct mount *mp; - struct vnode *vp; - vm_object_t object; - vm_paddr_t pa; - vm_page_t m, m_tmp, next; - int lockmode; - - vm_pagequeue_lock(pq); - TAILQ_FOREACH_SAFE(m, &pq->pq_pl, plinks.q, next) { - if ((m->flags & PG_MARKER) != 0) - continue; - pa = VM_PAGE_TO_PHYS(m); - if (pa < low || pa + PAGE_SIZE > high) - continue; - if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) { - vm_page_unlock(m); - continue; - } - object = m->object; - if ((!VM_OBJECT_TRYWLOCK(object) && - (!vm_pageout_fallback_object_lock(m, &next) || - m->hold_count != 0)) || vm_page_busied(m)) { - vm_page_unlock(m); - VM_OBJECT_WUNLOCK(object); - continue; - } - vm_page_test_dirty(m); - if (m->dirty == 0 && object->ref_count != 0) - pmap_remove_all(m); - if (m->dirty != 0) { - vm_page_unlock(m); - if (tries == 0 || (object->flags & OBJ_DEAD) != 0) { - VM_OBJECT_WUNLOCK(object); - continue; - } - if (object->type == OBJT_VNODE) { - vm_pagequeue_unlock(pq); - vp = object->handle; - vm_object_reference_locked(object); - VM_OBJECT_WUNLOCK(object); - (void)vn_start_write(vp, &mp, V_WAIT); - lockmode = MNT_SHARED_WRITES(vp->v_mount) ? 
- LK_SHARED : LK_EXCLUSIVE; - vn_lock(vp, lockmode | LK_RETRY); - VM_OBJECT_WLOCK(object); - vm_object_page_clean(object, 0, 0, OBJPC_SYNC); - VM_OBJECT_WUNLOCK(object); - VOP_UNLOCK(vp, 0); - vm_object_deallocate(object); - vn_finished_write(mp); - return (TRUE); - } else if (object->type == OBJT_SWAP || - object->type == OBJT_DEFAULT) { - vm_pagequeue_unlock(pq); - m_tmp = m; - vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC, - 0, NULL, NULL); - VM_OBJECT_WUNLOCK(object); - return (TRUE); - } - } else { - /* - * Dequeue here to prevent lock recursion in - * vm_page_cache(). - */ - vm_page_dequeue_locked(m); - vm_page_cache(m); - vm_page_unlock(m); - } - VM_OBJECT_WUNLOCK(object); - } - vm_pagequeue_unlock(pq); - return (FALSE); -} - -/* - * Increase the number of cached pages. The specified value, "tries", - * determines which categories of pages are cached: - * - * 0: All clean, inactive pages within the specified physical address range - * are cached. Will not sleep. - * 1: The vm_lowmem handlers are called. All inactive pages within - * the specified physical address range are cached. May sleep. - * 2: The vm_lowmem handlers are called. All inactive and active pages - * within the specified physical address range are cached. May sleep. - */ -void -vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high) -{ - int actl, actmax, inactl, inactmax, dom, initial_dom; - static int start_dom = 0; - - if (tries > 0) { - /* - * Decrease registered cache sizes. The vm_lowmem handlers - * may acquire locks and/or sleep, so they can only be invoked - * when "tries" is greater than zero. - */ - SDT_PROBE0(vm, , , vm__lowmem_cache); - EVENTHANDLER_INVOKE(vm_lowmem, 0); - - /* - * We do this explicitly after the caches have been drained - * above. - */ - uma_reclaim(); - } - - /* - * Make the next scan start on the next domain. 
- */ - initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains; - - inactl = 0; - inactmax = vm_cnt.v_inactive_count; - actl = 0; - actmax = tries < 2 ? 0 : vm_cnt.v_active_count; - dom = initial_dom; - - /* - * Scan domains in round-robin order, first inactive queues, - * then active. Since domain usually owns large physically - * contiguous chunk of memory, it makes sense to completely - * exhaust one domain before switching to next, while growing - * the pool of contiguous physical pages. - * - * Do not even start launder a domain which cannot contain - * the specified address range, as indicated by segments - * constituting the domain. - */ -again_inact: - if (inactl < inactmax) { - if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs, - low, high) && - vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE], - tries, low, high)) { - inactl++; - goto again_inact; - } - if (++dom == vm_ndomains) - dom = 0; - if (dom != initial_dom) - goto again_inact; - } -again_act: - if (actl < actmax) { - if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs, - low, high) && - vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE], - tries, low, high)) { - actl++; - goto again_act; - } - if (++dom == vm_ndomains) - dom = 0; - if (dom != initial_dom) - goto again_act; - } } #if !defined(NO_SWAPPING) /* * vm_pageout_object_deactivate_pages * * Deactivate enough pages to satisfy the inactive target * requirements. * * The object and map must be locked. 
*/
static void
vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
    long desired)
{
	vm_object_t backing_object, object;
	vm_page_t pg;
	int refs, shared;

	VM_OBJECT_ASSERT_LOCKED(first_object);
	if ((first_object->flags & OBJ_FICTITIOUS) != 0)
		return;

	/*
	 * Walk down the chain of backing objects, starting at the object
	 * handed in by the caller.  Leaving the loop by any route ends up
	 * at unlock_return with "object" naming the last object visited.
	 */
	object = first_object;
	for (;;) {
		/* Target already met: nothing more to do. */
		if (pmap_resident_count(pmap) <= desired)
			break;
		VM_OBJECT_ASSERT_LOCKED(object);
		if ((object->flags & OBJ_UNMANAGED) != 0 ||
		    object->paging_in_progress != 0)
			break;

		/* Non-zero when more than one object shadows this one. */
		shared = (object->shadow_count > 1) ? 1 : 0;

		/*
		 * Visit every resident page of the object.
		 */
		TAILQ_FOREACH(pg, &object->memq, listq) {
			if (pmap_resident_count(pmap) <= desired)
				goto unlock_return;
			if (vm_page_busied(pg))
				continue;
			PCPU_INC(cnt.v_pdpages);
			vm_page_lock(pg);
			/* Skip wired or held pages and pages not in this pmap. */
			if (pg->wire_count != 0 || pg->hold_count != 0 ||
			    !pmap_page_exists_quick(pmap, pg)) {
				vm_page_unlock(pg);
				continue;
			}

			/*
			 * Gather the reference information; a set
			 * PGA_REFERENCED flag counts as at least one
			 * reference and is consumed here.
			 */
			refs = pmap_ts_referenced(pg);
			if ((pg->aflags & PGA_REFERENCED) != 0) {
				if (refs == 0)
					refs = 1;
				vm_page_aflag_clear(pg, PGA_REFERENCED);
			}

			if (pg->queue == PQ_ACTIVE) {
				if (refs != 0) {
					/* Recently used: age it upwards. */
					vm_page_activate(pg);
					if (pg->act_count <
					    ACT_MAX - ACT_ADVANCE)
						pg->act_count += ACT_ADVANCE;
					vm_page_requeue(pg);
				} else {
					/* Idle: decay the activity count. */
					pg->act_count -= min(pg->act_count,
					    ACT_DECLINE);
					if (shared || pg->act_count != 0)
						vm_page_requeue(pg);
					else {
						pmap_remove_all(pg);
						vm_page_deactivate(pg);
					}
				}
			} else if (refs != 0) {
				/* Referenced page outside the active queue. */
				vm_page_activate(pg);
				pg->act_count += refs;
			} else if (pg->queue == PQ_INACTIVE)
				pmap_remove_all(pg);
			vm_page_unlock(pg);
		}

		/*
		 * Step to the backing object: read lock it first, then let
		 * go of the current one unless it is the caller's object.
		 */
		backing_object = object->backing_object;
		if (backing_object == NULL)
			break;
		VM_OBJECT_RLOCK(backing_object);
		if (object != first_object)
			VM_OBJECT_RUNLOCK(object);
		object = backing_object;
	}
unlock_return:
	/* The caller's lock on first_object is never released here. */
	if (object != first_object)
		VM_OBJECT_RUNLOCK(object);
}

/*
 * deactivate some number of pages in a map, try to do it fairly, but
 * that is really hard to do.
*/ static void vm_pageout_map_deactivate_pages(map, desired) vm_map_t map; long desired; { vm_map_entry_t tmpe; vm_object_t obj, bigobj; int nothingwired; if (!vm_map_trylock(map)) return; bigobj = NULL; nothingwired = TRUE; /* * first, search out the biggest object, and try to free pages from * that. */ tmpe = map->header.next; while (tmpe != &map->header) { if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) { if (obj->shadow_count <= 1 && (bigobj == NULL || bigobj->resident_page_count < obj->resident_page_count)) { if (bigobj != NULL) VM_OBJECT_RUNLOCK(bigobj); bigobj = obj; } else VM_OBJECT_RUNLOCK(obj); } } if (tmpe->wired_count > 0) nothingwired = FALSE; tmpe = tmpe->next; } if (bigobj != NULL) { vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired); VM_OBJECT_RUNLOCK(bigobj); } /* * Next, hunt around for other pages to deactivate. We actually * do this search sort of wrong -- .text first is not the best idea. */ tmpe = map->header.next; while (tmpe != &map->header) { if (pmap_resident_count(vm_map_pmap(map)) <= desired) break; if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL) { VM_OBJECT_RLOCK(obj); vm_pageout_object_deactivate_pages(map->pmap, obj, desired); VM_OBJECT_RUNLOCK(obj); } } tmpe = tmpe->next; } /* * Remove all mappings if a process is swapped out, this will free page * table pages. */ if (desired == 0 && nothingwired) { pmap_remove(vm_map_pmap(map), vm_map_min(map), vm_map_max(map)); } vm_map_unlock(map); } #endif /* !defined(NO_SWAPPING) */ /* * Attempt to acquire all of the necessary locks to launder a page and * then call through the clustering layer to PUTPAGES. Wait a short * time for a vnode lock. * * Requires the page and object lock on entry, releases both before return. * Returns 0 on success and an errno otherwise. 
*/ static int vm_pageout_clean(vm_page_t m) { struct vnode *vp; struct mount *mp; vm_object_t object; vm_pindex_t pindex; int error, lockmode; vm_page_assert_locked(m); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); error = 0; vp = NULL; mp = NULL; /* * The object is already known NOT to be dead. It * is possible for the vget() to block the whole * pageout daemon, but the new low-memory handling * code should prevent it. * * We can't wait forever for the vnode lock, we might * deadlock due to a vn_read() getting stuck in * vm_wait while holding this vnode. We skip the * vnode if we can't get it in a reasonable amount * of time. */ if (object->type == OBJT_VNODE) { vm_page_unlock(m); vp = object->handle; if (vp->v_type == VREG && vn_start_write(vp, &mp, V_NOWAIT) != 0) { mp = NULL; error = EDEADLK; goto unlock_all; } KASSERT(mp != NULL, ("vp %p with NULL v_mount", vp)); vm_object_reference_locked(object); pindex = m->pindex; VM_OBJECT_WUNLOCK(object); lockmode = MNT_SHARED_WRITES(vp->v_mount) ? LK_SHARED : LK_EXCLUSIVE; if (vget(vp, lockmode | LK_TIMELOCK, curthread)) { vp = NULL; error = EDEADLK; goto unlock_mp; } VM_OBJECT_WLOCK(object); vm_page_lock(m); /* * While the object and page were unlocked, the page * may have been: * (1) moved to a different queue, * (2) reallocated to a different object, * (3) reallocated to a different offset, or * (4) cleaned. */ if (m->queue != PQ_INACTIVE || m->object != object || m->pindex != pindex || m->dirty == 0) { vm_page_unlock(m); error = ENXIO; goto unlock_all; } /* * The page may have been busied or held while the object * and page locks were released. */ if (vm_page_busied(m) || m->hold_count != 0) { vm_page_unlock(m); error = EBUSY; goto unlock_all; } } /* * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we * start the cleaning operation. 
*/ if (vm_pageout_cluster(m) == 0) error = EIO; unlock_all: VM_OBJECT_WUNLOCK(object); unlock_mp: vm_page_lock_assert(m, MA_NOTOWNED); if (mp != NULL) { if (vp != NULL) vput(vp); vm_object_deallocate(object); vn_finished_write(mp); } return (error); } /* * vm_pageout_scan does the dirty work for the pageout daemon. * * pass 0 - Update active LRU/deactivate pages * pass 1 - Move inactive to cache or free * pass 2 - Launder dirty pages */ static void vm_pageout_scan(struct vm_domain *vmd, int pass) { vm_page_t m, next; struct vm_pagequeue *pq; vm_object_t object; long min_scan; int act_delta, addl_page_shortage, deficit, error, maxlaunder, maxscan; int page_shortage, scan_tick, scanned, starting_page_shortage; int vnodes_skipped; boolean_t pageout_ok, queues_locked; /* * If we need to reclaim memory ask kernel caches to return * some. We rate limit to avoid thrashing. */ if (vmd == &vm_dom[0] && pass > 0 && (time_uptime - lowmem_uptime) >= lowmem_period) { /* * Decrease registered cache sizes. */ SDT_PROBE0(vm, , , vm__lowmem_scan); EVENTHANDLER_INVOKE(vm_lowmem, 0); /* * We do this explicitly after the caches have been * drained above. */ uma_reclaim(); lowmem_uptime = time_uptime; } /* * The addl_page_shortage is the number of temporarily * stuck pages in the inactive queue. In other words, the * number of pages from the inactive count that should be * discounted in setting the target for the active queue scan. */ addl_page_shortage = 0; /* * Calculate the number of pages we want to either free or move * to the cache. */ if (pass > 0) { deficit = atomic_readandclear_int(&vm_pageout_deficit); page_shortage = vm_paging_target() + deficit; } else page_shortage = deficit = 0; starting_page_shortage = page_shortage; /* * maxlaunder limits the number of dirty pages we flush per scan. * For most systems a smaller value (16 or 32) is more robust under * extreme memory and disk pressure because any unnecessary writes * to disk can result in extreme performance degredation. 
However, * systems with excessive dirty pages (especially when MAP_NOSYNC is * used) will die horribly with limited laundering. If the pageout * daemon cannot clean enough pages in the first pass, we let it go * all out in succeeding passes. */ if ((maxlaunder = vm_max_launder) <= 1) maxlaunder = 1; if (pass > 1) maxlaunder = 10000; vnodes_skipped = 0; /* * Start scanning the inactive queue for pages we can move to the * cache or free. The scan will stop when the target is reached or * we have scanned the entire inactive queue. Note that m->act_count * is not used to form decisions for the inactive queue, only for the * active queue. */ pq = &vmd->vmd_pagequeues[PQ_INACTIVE]; maxscan = pq->pq_cnt; vm_pagequeue_lock(pq); queues_locked = TRUE; for (m = TAILQ_FIRST(&pq->pq_pl); m != NULL && maxscan-- > 0 && page_shortage > 0; m = next) { vm_pagequeue_assert_locked(pq); KASSERT(queues_locked, ("unlocked queues")); KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m)); PCPU_INC(cnt.v_pdpages); next = TAILQ_NEXT(m, plinks.q); /* * skip marker pages */ if (m->flags & PG_MARKER) continue; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in inactive queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("Unmanaged page %p cannot be in inactive queue", m)); /* * The page or object lock acquisitions fail if the * page was removed from the queue or moved to a * different position within the queue. In either * case, addl_page_shortage should not be incremented. */ if (!vm_pageout_page_lock(m, &next)) goto unlock_page; else if (m->hold_count != 0) { /* * Held pages are essentially stuck in the * queue. So, they ought to be discounted * from the inactive count. See the * calculation of the page_shortage for the * loop over the active queue below. 
*/ addl_page_shortage++; goto unlock_page; } object = m->object; if (!VM_OBJECT_TRYWLOCK(object)) { if (!vm_pageout_fallback_object_lock(m, &next)) goto unlock_object; else if (m->hold_count != 0) { addl_page_shortage++; goto unlock_object; } } if (vm_page_busied(m)) { /* * Don't mess with busy pages. Leave them at * the front of the queue. Most likely, they * are being paged out and will leave the * queue shortly after the scan finishes. So, * they ought to be discounted from the * inactive count. */ addl_page_shortage++; unlock_object: VM_OBJECT_WUNLOCK(object); unlock_page: vm_page_unlock(m); continue; } KASSERT(m->hold_count == 0, ("Held page %p", m)); /* * We unlock the inactive page queue, invalidating the * 'next' pointer. Use our marker to remember our * place. */ TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q); vm_pagequeue_unlock(pq); queues_locked = FALSE; /* * Invalid pages can be easily freed. They cannot be * mapped, vm_page_free() asserts this. */ if (m->valid == 0) goto free_page; /* * If the page has been referenced and the object is not dead, * reactivate or requeue the page depending on whether the * object is mapped. */ if ((m->aflags & PGA_REFERENCED) != 0) { vm_page_aflag_clear(m, PGA_REFERENCED); act_delta = 1; } else act_delta = 0; if (object->ref_count != 0) { act_delta += pmap_ts_referenced(m); } else { KASSERT(!pmap_page_is_mapped(m), ("vm_pageout_scan: page %p is mapped", m)); } if (act_delta != 0) { if (object->ref_count != 0) { vm_page_activate(m); /* * Increase the activation count if the page * was referenced while in the inactive queue. * This makes it less likely that the page will * be returned prematurely to the inactive * queue. 
*/ m->act_count += act_delta + ACT_ADVANCE; goto drop_page; } else if ((object->flags & OBJ_DEAD) == 0) goto requeue_page; } /* * If the page appears to be clean at the machine-independent * layer, then remove all of its mappings from the pmap in * anticipation of placing it onto the cache queue. If, * however, any of the page's mappings allow write access, * then the page may still be modified until the last of those * mappings are removed. */ if (object->ref_count != 0) { vm_page_test_dirty(m); if (m->dirty == 0) pmap_remove_all(m); } if (m->dirty == 0) { /* * Clean pages can be freed. */ free_page: vm_page_free(m); PCPU_INC(cnt.v_dfree); --page_shortage; } else if ((object->flags & OBJ_DEAD) != 0) { /* * Leave dirty pages from dead objects at the front of * the queue. They are being paged out and freed by * the thread that destroyed the object. They will * leave the queue shortly after the scan finishes, so * they should be discounted from the inactive count. */ addl_page_shortage++; } else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) { /* * Dirty pages need to be paged out, but flushing * a page is extremely expensive versus freeing * a clean page. Rather then artificially limiting * the number of pages we can flush, we instead give * dirty pages extra priority on the inactive queue * by forcing them to be cycled through the queue * twice before being flushed, after which the * (now clean) page will cycle through once more * before being freed. This significantly extends * the thrash point for a heavily loaded machine. */ m->flags |= PG_WINATCFLS; requeue_page: vm_pagequeue_lock(pq); queues_locked = TRUE; vm_page_requeue_locked(m); } else if (maxlaunder > 0) { /* * We always want to try to flush some dirty pages if * we encounter them, to keep the system stable. * Normally this number is small, but under extreme * pressure where there are insufficient clean pages * on the inactive queue, we may have to go all out. 
*/ if (object->type != OBJT_SWAP && object->type != OBJT_DEFAULT) pageout_ok = TRUE; else if (disable_swap_pageouts) pageout_ok = FALSE; else if (defer_swap_pageouts) pageout_ok = vm_page_count_min(); else pageout_ok = TRUE; if (!pageout_ok) goto requeue_page; error = vm_pageout_clean(m); /* * Decrement page_shortage on success to account for * the (future) cleaned page. Otherwise we could wind * up laundering or cleaning too many pages. */ if (error == 0) { page_shortage--; maxlaunder--; } else if (error == EDEADLK) { pageout_lock_miss++; vnodes_skipped++; } else if (error == EBUSY) { addl_page_shortage++; } vm_page_lock_assert(m, MA_NOTOWNED); goto relock_queues; } drop_page: vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); relock_queues: if (!queues_locked) { vm_pagequeue_lock(pq); queues_locked = TRUE; } next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q); } vm_pagequeue_unlock(pq); #if !defined(NO_SWAPPING) /* * Wakeup the swapout daemon if we didn't cache or free the targeted * number of pages. */ if (vm_swap_enabled && page_shortage > 0) vm_req_vmdaemon(VM_SWAP_NORMAL); #endif /* * Wakeup the sync daemon if we skipped a vnode in a writeable object * and we didn't cache or free enough pages. */ if (vnodes_skipped > 0 && page_shortage > vm_cnt.v_free_target - vm_cnt.v_free_min) (void)speedup_syncer(); /* * If the inactive queue scan fails repeatedly to meet its * target, kill the largest process. */ vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage); /* * Compute the number of pages we want to try to move from the * active queue to the inactive queue. */ page_shortage = vm_cnt.v_inactive_target - vm_cnt.v_inactive_count + vm_paging_target() + deficit + addl_page_shortage; pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; vm_pagequeue_lock(pq); maxscan = pq->pq_cnt; /* * If we're just idle polling attempt to visit every * active page within 'update_period' seconds. 
*/ scan_tick = ticks; if (vm_pageout_update_period != 0) { min_scan = pq->pq_cnt; min_scan *= scan_tick - vmd->vmd_last_active_scan; min_scan /= hz * vm_pageout_update_period; } else min_scan = 0; if (min_scan > 0 || (page_shortage > 0 && maxscan > 0)) vmd->vmd_last_active_scan = scan_tick; /* * Scan the active queue for pages that can be deactivated. Update * the per-page activity counter and use it to identify deactivation * candidates. */ for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned < min_scan || (page_shortage > 0 && scanned < maxscan)); m = next, scanned++) { KASSERT(m->queue == PQ_ACTIVE, ("vm_pageout_scan: page %p isn't active", m)); next = TAILQ_NEXT(m, plinks.q); if ((m->flags & PG_MARKER) != 0) continue; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in active queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("Unmanaged page %p cannot be in active queue", m)); if (!vm_pageout_page_lock(m, &next)) { vm_page_unlock(m); continue; } /* * The count for pagedaemon pages is done after checking the * page for eligibility... */ PCPU_INC(cnt.v_pdpages); /* * Check to see "how much" the page has been used. */ if ((m->aflags & PGA_REFERENCED) != 0) { vm_page_aflag_clear(m, PGA_REFERENCED); act_delta = 1; } else act_delta = 0; /* * Unlocked object ref count check. Two races are possible. * 1) The ref was transitioning to zero and we saw non-zero, * the pmap bits will be checked unnecessarily. * 2) The ref was transitioning to one and we saw zero. * The page lock prevents a new reference to this page so * we need not check the reference bits. */ if (m->object->ref_count != 0) act_delta += pmap_ts_referenced(m); /* * Advance or decay the act_count based on recent usage. 
*/
		if (act_delta != 0) {
			m->act_count += ACT_ADVANCE + act_delta;
			if (m->act_count > ACT_MAX)
				m->act_count = ACT_MAX;
		} else
			m->act_count -= min(m->act_count, ACT_DECLINE);

		/*
		 * Move this page to the tail of the active or inactive
		 * queue depending on usage.
		 */
		if (m->act_count == 0) {
			/* Dequeue to avoid later lock recursion. */
			vm_page_dequeue_locked(m);
			vm_page_deactivate(m);
			page_shortage--;
		} else
			vm_page_requeue_locked(m);
		vm_page_unlock(m);
	}
	vm_pagequeue_unlock(pq);
#if !defined(NO_SWAPPING)
	/*
	 * Idle process swapout -- run once per second.
	 */
	if (vm_swap_idle_enabled) {
		static long lsec;
		if (time_second != lsec) {
			vm_req_vmdaemon(VM_SWAP_IDLE);
			lsec = time_second;
		}
	}
#endif
}

/*
 * Number of page daemon domains currently voting for an OOM kill.
 * Updated with atomic operations in vm_pageout_mightbe_oom().
 */
static int vm_pageout_oom_vote;

/*
 * The pagedaemon threads randomly select one to perform the
 * OOM. Trying to kill processes before all pagedaemons
 * failed to reach free target is premature.
 *
 * vmd is the calling page daemon's domain. A pass counts towards the
 * OOM sequence only when starting_page_shortage is positive and equal
 * to page_shortage; any other pass resets the sequence.
 * NOTE(review): presumably equality means the scan reclaimed nothing;
 * confirm against the part of vm_pageout_scan() that sets
 * starting_page_shortage.
 */
static void
vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
    int starting_page_shortage)
{
	int old_vote;

	if (starting_page_shortage <= 0 || starting_page_shortage !=
	    page_shortage)
		vmd->vmd_oom_seq = 0;
	else
		vmd->vmd_oom_seq++;
	if (vmd->vmd_oom_seq < vm_pageout_oom_seq) {
		/*
		 * Below the threshold: withdraw this domain's vote if
		 * one is outstanding.
		 */
		if (vmd->vmd_oom) {
			vmd->vmd_oom = FALSE;
			atomic_subtract_int(&vm_pageout_oom_vote, 1);
		}
		return;
	}

	/*
	 * Do not follow the call sequence until OOM condition is
	 * cleared.
	 */
	vmd->vmd_oom_seq = 0;

	/* This domain has already voted; do not vote twice. */
	if (vmd->vmd_oom)
		return;

	vmd->vmd_oom = TRUE;
	/*
	 * atomic_fetchadd_int() returns the previous value, so only the
	 * caller that observes vm_ndomains - 1 earlier votes proceeds.
	 */
	old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
	if (old_vote != vm_ndomains - 1)
		return;

	/*
	 * The current pagedaemon thread is the last in the quorum to
	 * start OOM. Initiate the selection and signaling of the
	 * victim.
	 */
	vm_pageout_oom(VM_OOM_MEM);

	/*
	 * After one round of OOM terror, recall our vote. On the
	 * next pass, current pagedaemon would vote again if the low
	 * memory condition is still there, due to vmd_oom being
	 * false.
	 */
	vmd->vmd_oom = FALSE;
	atomic_subtract_int(&vm_pageout_oom_vote, 1);
}

/*
 * The OOM killer is the page daemon's action of last resort when
 * memory allocation requests have been stalled for a prolonged period
 * of time because it cannot reclaim memory. This function computes
 * the approximate number of physical pages that could be reclaimed if
 * the specified address space is destroyed.
 *
 * Private, anonymous memory owned by the address space is the
 * principal resource that we expect to recover after an OOM kill.
 * Since the physical pages mapped by the address space's COW entries
 * are typically shared pages, they are unlikely to be released and so
 * they are not counted.
 *
 * To get to the point where the page daemon runs the OOM killer, its
 * efforts to write-back vnode-backed pages may have stalled. This
 * could be caused by a memory allocation deadlock in the write path
 * that might be resolved by an OOM kill. Therefore, physical pages
 * belonging to vnode-backed objects are counted, because they might
 * be freed without being written out first if the address space holds
 * the last reference to an unlinked vnode.
 *
 * Similarly, physical pages belonging to OBJT_PHYS objects are
 * counted because the address space might hold the last reference to
 * the object.
*/
static long
vm_pageout_oom_pagecount(struct vmspace *vmspace)
{
	vm_map_t map;
	vm_map_entry_t entry;
	vm_object_t obj;
	long res;

	map = &vmspace->vm_map;
	/* The caller must hold the map lock; system maps are not expected. */
	KASSERT(!map->system_map, ("system map"));
	sx_assert(&map->lock, SA_LOCKED);
	res = 0;
	for (entry = map->header.next; entry != &map->header;
	    entry = entry->next) {
		/* Submap entries and entries without an object add nothing. */
		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
			continue;
		obj = entry->object.vm_object;
		if (obj == NULL)
			continue;
		/*
		 * Skip copy-on-write entries unless this entry holds the
		 * only reference to the object.
		 */
		if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 &&
		    obj->ref_count != 1)
			continue;
		/* Only these object types contribute resident pages. */
		switch (obj->type) {
		case OBJT_DEFAULT:
		case OBJT_SWAP:
		case OBJT_PHYS:
		case OBJT_VNODE:
			res += obj->resident_page_count;
			break;
		}
	}
	return (res);
}

/*
 * Select the eligible process with the largest footprint and kill it,
 * or panic instead when vm_panic_on_oom is non-zero.
 *
 * The footprint is the swap usage of the process; when shortage is
 * VM_OOM_MEM the estimate from vm_pageout_oom_pagecount() is added.
 * If no eligible process is found nothing is done.
 */
void
vm_pageout_oom(int shortage)
{
	struct proc *p, *bigproc;
	vm_offset_t size, bigsize;
	struct thread *td;
	struct vmspace *vm;

	/*
	 * We keep the process bigproc locked once we find it to keep anyone
	 * from messing with it; however, there is a possibility of
	 * deadlock if process B is bigproc and one of its child processes
	 * attempts to propagate a signal to B while we are waiting for A's
	 * lock while walking this list. To avoid this, we don't block on
	 * the process lock but just skip a process if it is already locked.
	 *
	 * NOTE(review): the code below takes PROC_LOCK() unconditionally and
	 * keeps the candidate alive with _PHOLD()/PRELE() rather than by
	 * keeping it locked; only the map lock is a try-lock. The paragraph
	 * above looks stale -- confirm.
	 */
	bigproc = NULL;
	bigsize = 0;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		int breakout;

		PROC_LOCK(p);

		/*
		 * If this is a system, protected or killed process, skip it.
		 */
		if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
		    P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
		    p->p_pid == 1 || P_KILLED(p) ||
		    (p->p_pid < 48 && swap_pager_avail != 0)) {
			PROC_UNLOCK(p);
			continue;
		}
		/*
		 * If the process is in a non-running type state,
		 * don't touch it. Check all the threads individually.
		 */
		breakout = 0;
		FOREACH_THREAD_IN_PROC(p, td) {
			thread_lock(td);
			if (!TD_ON_RUNQ(td) &&
			    !TD_IS_RUNNING(td) &&
			    !TD_IS_SLEEPING(td) &&
			    !TD_IS_SUSPENDED(td) &&
			    !TD_IS_SWAPPED(td)) {
				thread_unlock(td);
				breakout = 1;
				break;
			}
			thread_unlock(td);
		}
		if (breakout) {
			PROC_UNLOCK(p);
			continue;
		}
		/*
		 * get the process size
		 */
		vm = vmspace_acquire_ref(p);
		if (vm == NULL) {
			PROC_UNLOCK(p);
			continue;
		}
		/* Hold the process so it stays valid once its lock is dropped. */
		_PHOLD(p);
		if (!vm_map_trylock_read(&vm->vm_map)) {
			/* Map is busy: undo the hold and move on. */
			_PRELE(p);
			PROC_UNLOCK(p);
			vmspace_free(vm);
			continue;
		}
		PROC_UNLOCK(p);
		size = vmspace_swap_count(vm);
		if (shortage == VM_OOM_MEM)
			size += vm_pageout_oom_pagecount(vm);
		vm_map_unlock_read(&vm->vm_map);
		vmspace_free(vm);

		/*
		 * If this process is bigger than the biggest one,
		 * remember it.
		 */
		if (size > bigsize) {
			/* Release the hold on the previous candidate. */
			if (bigproc != NULL)
				PRELE(bigproc);
			bigproc = p;
			bigsize = size;
		} else {
			PRELE(p);
		}
	}
	sx_sunlock(&allproc_lock);
	if (bigproc != NULL) {
		if (vm_panic_on_oom != 0)
			panic("out of swap space");
		PROC_LOCK(bigproc);
		killproc(bigproc, "out of swap space");
		sched_nice(bigproc, PRIO_MIN);
		_PRELE(bigproc);
		PROC_UNLOCK(bigproc);
		wakeup(&vm_cnt.v_free_count);
	}
}

static void
vm_pageout_worker(void *arg)
{
	struct vm_domain *domain;
	int domidx;

	domidx = (uintptr_t)arg;
	domain = &vm_dom[domidx];

	/*
	 * XXXKIB It could be useful to bind pageout daemon threads to
	 * the cores belonging to the domain, from which vm_page_array
	 * is allocated.
	 */

	KASSERT(domain->vmd_segs != 0, ("domain without segments"));
	domain->vmd_last_active_scan = ticks;
	vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE);
	vm_pageout_init_marker(&domain->vmd_inacthead, PQ_INACTIVE);
	TAILQ_INSERT_HEAD(&domain->vmd_pagequeues[PQ_INACTIVE].pq_pl,
	    &domain->vmd_inacthead, plinks.q);

	/*
	 * The pageout daemon worker is never done, so loop forever.
	 */
	while (TRUE) {
		/*
		 * If we have enough free memory, wakeup waiters. Do
		 * not clear vm_pages_needed until we reach our target,
		 * otherwise we may be woken up over and over again and
		 * waste a lot of cpu.
*/
		mtx_lock(&vm_page_queue_free_mtx);
		if (vm_pages_needed && !vm_page_count_min()) {
			if (!vm_paging_needed())
				vm_pages_needed = 0;
			wakeup(&vm_cnt.v_free_count);
		}
		if (vm_pages_needed) {
			/*
			 * We're still not done. Either vm_pages_needed was
			 * set by another thread during the previous scan
			 * (typically, this happens during a level 0 scan) or
			 * vm_pages_needed was already set and the scan failed
			 * to free enough pages. If we haven't yet performed
			 * a level >= 2 scan (unlimited dirty cleaning), then
			 * upgrade the level and scan again now. Otherwise,
			 * sleep a bit and try again later. While sleeping,
			 * vm_pages_needed can be cleared.
			 */
			if (domain->vmd_pass > 1)
				msleep(&vm_pages_needed,
				    &vm_page_queue_free_mtx, PVM, "psleep",
				    hz / 2);
		} else {
			/*
			 * Good enough, sleep until required to refresh
			 * stats.
			 */
			msleep(&vm_pages_needed,
			    &vm_page_queue_free_mtx, PVM, "psleep", hz);
		}
		/*
		 * Re-test after the sleep: escalate the pass level while
		 * pages are still needed, otherwise start again from 0.
		 */
		if (vm_pages_needed) {
			vm_cnt.v_pdwakeups++;
			domain->vmd_pass++;
		} else
			domain->vmd_pass = 0;
		mtx_unlock(&vm_page_queue_free_mtx);
		vm_pageout_scan(domain, domain->vmd_pass);
	}
}

/*
 * vm_pageout_init initialises basic pageout daemon settings.
 *
 * Derives the vm_cnt free/inactive page thresholds and the wakeup
 * threshold from v_page_count and v_free_count. The statements are
 * order-dependent: several later values are computed from earlier ones
 * before those are adjusted.
 */
static void
vm_pageout_init(void)
{
	/*
	 * Initialize some paging parameters.
	 */
	vm_cnt.v_interrupt_free_min = 2;
	if (vm_cnt.v_page_count < 2000)
		vm_pageout_page_count = 8;

	/*
	 * v_free_reserved needs to include enough for the largest
	 * swap pager structures plus enough for any pv_entry structs
	 * when paging.
	 */
	if (vm_cnt.v_page_count > 1024)
		vm_cnt.v_free_min = 4 + (vm_cnt.v_page_count - 1024) / 200;
	else
		vm_cnt.v_free_min = 4;
	vm_cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
	    vm_cnt.v_interrupt_free_min;
	vm_cnt.v_free_reserved = vm_pageout_page_count +
	    vm_cnt.v_pageout_free_min + (vm_cnt.v_page_count / 768);
	/*
	 * v_free_severe and v_free_target are derived from v_free_min
	 * before the reserve is folded into v_free_min below.
	 */
	vm_cnt.v_free_severe = vm_cnt.v_free_min / 2;
	vm_cnt.v_free_target = 4 * vm_cnt.v_free_min +
	    vm_cnt.v_free_reserved;
	vm_cnt.v_free_min += vm_cnt.v_free_reserved;
	vm_cnt.v_free_severe += vm_cnt.v_free_reserved;
	/* Cap the inactive target at a third of the free page count. */
	vm_cnt.v_inactive_target = (3 * vm_cnt.v_free_target) / 2;
	if (vm_cnt.v_inactive_target > vm_cnt.v_free_count / 3)
		vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3;

	/*
	 * Set the default wakeup threshold to be 10% above the minimum
	 * page limit. This keeps the steady state out of shortfall.
	 */
	vm_pageout_wakeup_thresh = (vm_cnt.v_free_min / 10) * 11;

	/*
	 * Set interval in seconds for active scan. We want to visit each
	 * page at least once every ten minutes. This is to prevent worst
	 * case paging behaviors with stale active LRU.
	 */
	if (vm_pageout_update_period == 0)
		vm_pageout_update_period = 600;

	/* XXX does not really belong here */
	if (vm_page_max_wired == 0)
		vm_page_max_wired = vm_cnt.v_free_count / 3;
}

/*
 * vm_pageout is the high level pageout daemon.
 *
 * Starts one vm_pageout_worker kthread for each domain other than 0
 * (only when MAXMEMDOM > 1) and the uma_reclaim_worker kthread, then
 * runs the domain 0 worker in the calling thread. Failure to create a
 * kthread panics.
 */
static void
vm_pageout(void)
{
	int error;
#if MAXMEMDOM > 1
	int i;
#endif

	swap_pager_swap_init();
#if MAXMEMDOM > 1
	for (i = 1; i < vm_ndomains; i++) {
		error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
		    curproc, NULL, 0, 0, "dom%d", i);
		if (error != 0) {
			panic("starting pageout for domain %d, error %d\n",
			    i, error);
		}
	}
#endif
	error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL,
	    0, 0, "uma");
	if (error != 0)
		panic("starting uma_reclaim helper, error %d\n", error);
	vm_pageout_worker((void *)(uintptr_t)0);
}

/*
 * Unless the free page queue lock is held by the caller, this function
 * should be regarded as advisory.
Specifically, the caller should
 * not msleep() on &vm_cnt.v_free_count following this function unless
 * the free page queue lock is held until the msleep() is performed.
 *
 * Sets vm_pages_needed and wakes sleepers on it, unless it is already
 * set or the caller is a thread of pageproc itself.
 */
void
pagedaemon_wakeup(void)
{

	if (!vm_pages_needed && curthread->td_proc != pageproc) {
		vm_pages_needed = 1;
		wakeup(&vm_pages_needed);
	}
}

#if !defined(NO_SWAPPING)
/*
 * Record a swapout request (req is OR-ed into vm_pageout_req_swapout)
 * under vm_daemon_mtx. Sleepers on vm_daemon_needed are woken only if
 * more than hz ticks have passed since the last wakeup.
 */
static void
vm_req_vmdaemon(int req)
{
	static int lastrun = 0;

	mtx_lock(&vm_daemon_mtx);
	vm_pageout_req_swapout |= req;
	/* The ticks < lastrun test also triggers once ticks wraps around. */
	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
		wakeup(&vm_daemon_needed);
		lastrun = ticks;
	}
	mtx_unlock(&vm_daemon_mtx);
}

static void
vm_daemon(void)
{
	struct rlimit rsslim;
	struct proc *p;
	struct thread *td;
	struct vmspace *vm;
	int breakout, swapout_flags, tryagain, attempts;
#ifdef RACCT
	uint64_t rsize, ravailable;
#endif

	while (TRUE) {
		mtx_lock(&vm_daemon_mtx);
		msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
#ifdef RACCT
		    racct_enable ? hz : 0
#else
		    0
#endif
		);
		swapout_flags = vm_pageout_req_swapout;
		vm_pageout_req_swapout = 0;
		mtx_unlock(&vm_daemon_mtx);
		if (swapout_flags)
			swapout_procs(swapout_flags);

		/*
		 * scan the processes for exceeding their rlimits or if
		 * process is swapped out -- deactivate pages
		 */
		tryagain = 0;
		attempts = 0;
again:
		attempts++;
		sx_slock(&allproc_lock);
		FOREACH_PROC_IN_SYSTEM(p) {
			vm_pindex_t limit, size;

			/*
			 * if this is a system process or if we have already
			 * looked at this process, skip it.
			 */
			PROC_LOCK(p);
			if (p->p_state != PRS_NORMAL ||
			    p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
				PROC_UNLOCK(p);
				continue;
			}
			/*
			 * if the process is in a non-running type state,
			 * don't touch it.
*/ breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td) && !TD_IS_SUSPENDED(td)) { thread_unlock(td); breakout = 1; break; } thread_unlock(td); } if (breakout) { PROC_UNLOCK(p); continue; } /* * get a limit */ lim_rlimit_proc(p, RLIMIT_RSS, &rsslim); limit = OFF_TO_IDX( qmin(rsslim.rlim_cur, rsslim.rlim_max)); /* * let processes that are swapped out really be * swapped out set the limit to nothing (will force a * swap-out.) */ if ((p->p_flag & P_INMEM) == 0) limit = 0; /* XXX */ vm = vmspace_acquire_ref(p); PROC_UNLOCK(p); if (vm == NULL) continue; size = vmspace_resident_count(vm); if (size >= limit) { vm_pageout_map_deactivate_pages( &vm->vm_map, limit); } #ifdef RACCT if (racct_enable) { rsize = IDX_TO_OFF(size); PROC_LOCK(p); racct_set(p, RACCT_RSS, rsize); ravailable = racct_get_available(p, RACCT_RSS); PROC_UNLOCK(p); if (rsize > ravailable) { /* * Don't be overly aggressive; this * might be an innocent process, * and the limit could've been exceeded * by some memory hog. Don't try * to deactivate more than 1/4th * of process' resident set size. */ if (attempts <= 8) { if (ravailable < rsize - (rsize / 4)) { ravailable = rsize - (rsize / 4); } } vm_pageout_map_deactivate_pages( &vm->vm_map, OFF_TO_IDX(ravailable)); /* Update RSS usage after paging out. */ size = vmspace_resident_count(vm); rsize = IDX_TO_OFF(size); PROC_LOCK(p); racct_set(p, RACCT_RSS, rsize); PROC_UNLOCK(p); if (rsize > ravailable) tryagain = 1; } } #endif vmspace_free(vm); } sx_sunlock(&allproc_lock); if (tryagain != 0 && attempts <= 10) goto again; } } #endif /* !defined(NO_SWAPPING) */ Index: head/sys/vm/vm_pageout.h =================================================================== --- head/sys/vm/vm_pageout.h (revision 292468) +++ head/sys/vm/vm_pageout.h (revision 292469) @@ -1,107 +1,106 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. 
* * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.h 8.2 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Author: Avadis Tevanian, Jr. 
* * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ #ifndef _VM_VM_PAGEOUT_H_ #define _VM_VM_PAGEOUT_H_ /* * Header file for pageout daemon. */ /* * Exported data structures. */ extern int vm_page_max_wired; extern int vm_pages_needed; /* should be some "event" structure */ extern int vm_pageout_deficit; extern int vm_pageout_page_count; /* * Swap out requests */ #define VM_SWAP_NORMAL 1 #define VM_SWAP_IDLE 2 #define VM_OOM_MEM 1 #define VM_OOM_SWAPZ 2 /* * Exported routines. */ /* * Signal pageout-daemon and wait for it. 
*/ extern void pagedaemon_wakeup(void); #define VM_WAIT vm_wait() #define VM_WAITPFAULT vm_waitpfault() extern void vm_wait(void); extern void vm_waitpfault(void); #ifdef _KERNEL int vm_pageout_flush(vm_page_t *, int, int, int, int *, boolean_t *); -void vm_pageout_grow_cache(int, vm_paddr_t, vm_paddr_t); void vm_pageout_oom(int shortage); #endif #endif /* _VM_VM_PAGEOUT_H_ */ Index: head/sys/vm/vm_phys.c =================================================================== --- head/sys/vm/vm_phys.c (revision 292468) +++ head/sys/vm/vm_phys.c (revision 292469) @@ -1,1442 +1,1525 @@ /*- * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Physical memory system implementation * * Any external functions defined by this module are only to be used by the * virtual memory system. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_vm.h" #include #include #include #include #include #include #if MAXMEMDOM > 1 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX, "Too many physsegs."); struct mem_affinity *mem_affinity; int *mem_locality; int vm_ndomains = 1; struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX]; int vm_phys_nsegs; struct vm_phys_fictitious_seg; static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *, struct vm_phys_fictitious_seg *); RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree = RB_INITIALIZER(_vm_phys_fictitious_tree); struct vm_phys_fictitious_seg { RB_ENTRY(vm_phys_fictitious_seg) node; /* Memory region data */ vm_paddr_t start; vm_paddr_t end; vm_page_t first_page; }; RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node, vm_phys_fictitious_cmp); static struct rwlock vm_phys_fictitious_reg_lock; MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages"); static struct vm_freelist vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER]; static int vm_nfreelists; /* * Provides the mapping from VM_FREELIST_* to free list indices 
(flind). */ static int vm_freelist_to_flind[VM_NFREELIST]; CTASSERT(VM_FREELIST_DEFAULT == 0); #ifdef VM_FREELIST_ISADMA #define VM_ISADMA_BOUNDARY 16777216 #endif #ifdef VM_FREELIST_DMA32 #define VM_DMA32_BOUNDARY ((vm_paddr_t)1 << 32) #endif /* * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about * the ordering of the free list boundaries. */ #if defined(VM_ISADMA_BOUNDARY) && defined(VM_LOWMEM_BOUNDARY) CTASSERT(VM_ISADMA_BOUNDARY < VM_LOWMEM_BOUNDARY); #endif #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY) CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY); #endif static int cnt_prezero; SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD, &cnt_prezero, 0, "The number of physical pages prezeroed at idle time"); static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info"); static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info"); #if MAXMEMDOM > 1 static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info"); #endif SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD, &vm_ndomains, 0, "Number of physical memory domains available."); /* * Default to first-touch + round-robin. 
*/ static struct mtx vm_default_policy_mtx; MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex", MTX_DEF); #if MAXMEMDOM > 1 static struct vm_domain_policy vm_default_policy = VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0); #else /* Use round-robin so the domain policy code will only try once per allocation */ static struct vm_domain_policy vm_default_policy = VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_ROUND_ROBIN, 0); #endif static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool, int order); +static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, + u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, + vm_paddr_t boundary); static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain); static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end); static int vm_phys_paddr_to_segind(vm_paddr_t pa); static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order); static int sysctl_vm_default_policy(SYSCTL_HANDLER_ARGS) { char policy_name[32]; int error; mtx_lock(&vm_default_policy_mtx); /* Map policy to output string */ switch (vm_default_policy.p.policy) { case VM_POLICY_FIRST_TOUCH: strcpy(policy_name, "first-touch"); break; case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: strcpy(policy_name, "first-touch-rr"); break; case VM_POLICY_ROUND_ROBIN: default: strcpy(policy_name, "rr"); break; } mtx_unlock(&vm_default_policy_mtx); error = sysctl_handle_string(oidp, &policy_name[0], sizeof(policy_name), req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&vm_default_policy_mtx); /* Set: match on the subset of policies that make sense as a default */ if (strcmp("first-touch-rr", policy_name) == 0) { vm_domain_policy_set(&vm_default_policy, VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0); } else if (strcmp("first-touch", policy_name) == 0) { vm_domain_policy_set(&vm_default_policy, VM_POLICY_FIRST_TOUCH, 0); } else if 
(strcmp("rr", policy_name) == 0) { vm_domain_policy_set(&vm_default_policy, VM_POLICY_ROUND_ROBIN, 0); } else { error = EINVAL; goto finish; } error = 0; finish: mtx_unlock(&vm_default_policy_mtx); return (error); } SYSCTL_PROC(_vm, OID_AUTO, default_policy, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, sysctl_vm_default_policy, "A", "Default policy (rr, first-touch, first-touch-rr"); /* * Red-black tree helpers for vm fictitious range management. */ static inline int vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p, struct vm_phys_fictitious_seg *range) { KASSERT(range->start != 0 && range->end != 0, ("Invalid range passed on search for vm_fictitious page")); if (p->start >= range->end) return (1); if (p->start < range->start) return (-1); return (0); } static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1, struct vm_phys_fictitious_seg *p2) { /* Check if this is a search for a page */ if (p1->end == 0) return (vm_phys_fictitious_in_range(p1, p2)); KASSERT(p2->end != 0, ("Invalid range passed as second parameter to vm fictitious comparison")); /* Searching to add a new range */ if (p1->end <= p2->start) return (-1); if (p1->start >= p2->end) return (1); panic("Trying to add overlapping vm fictitious ranges:\n" "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start, (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end); } static __inline int vm_rr_selectdomain(void) { #if MAXMEMDOM > 1 struct thread *td; td = curthread; td->td_dom_rr_idx++; td->td_dom_rr_idx %= vm_ndomains; return (td->td_dom_rr_idx); #else return (0); #endif } /* * Initialise a VM domain iterator. * * Check the thread policy, then the proc policy, * then default to the system policy. * * Later on the various layers will have this logic * plumbed into them and the phys code will be explicitly * handed a VM domain policy to use. 
*/ static void vm_policy_iterator_init(struct vm_domain_iterator *vi) { #if MAXMEMDOM > 1 struct vm_domain_policy lcl; #endif vm_domain_iterator_init(vi); #if MAXMEMDOM > 1 /* Copy out the thread policy */ vm_domain_policy_localcopy(&lcl, &curthread->td_vm_dom_policy); if (lcl.p.policy != VM_POLICY_NONE) { /* Thread policy is present; use it */ vm_domain_iterator_set_policy(vi, &lcl); return; } vm_domain_policy_localcopy(&lcl, &curthread->td_proc->p_vm_dom_policy); if (lcl.p.policy != VM_POLICY_NONE) { /* Process policy is present; use it */ vm_domain_iterator_set_policy(vi, &lcl); return; } #endif /* Use system default policy */ vm_domain_iterator_set_policy(vi, &vm_default_policy); } static void vm_policy_iterator_finish(struct vm_domain_iterator *vi) { vm_domain_iterator_cleanup(vi); } boolean_t vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high) { struct vm_phys_seg *s; int idx; while ((idx = ffsl(mask)) != 0) { idx--; /* ffsl counts from 1 */ mask &= ~(1UL << idx); s = &vm_phys_segs[idx]; if (low < s->end && high > s->start) return (TRUE); } return (FALSE); } /* * Outputs the state of the physical memory allocator, specifically, * the amount of physical memory in each free list. 
*/ static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; struct vm_freelist *fl; int dom, error, flind, oind, pind; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req); for (dom = 0; dom < vm_ndomains; dom++) { sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom); for (flind = 0; flind < vm_nfreelists; flind++) { sbuf_printf(&sbuf, "\nFREE LIST %d:\n" "\n ORDER (SIZE) | NUMBER" "\n ", flind); for (pind = 0; pind < VM_NFREEPOOL; pind++) sbuf_printf(&sbuf, " | POOL %d", pind); sbuf_printf(&sbuf, "\n-- "); for (pind = 0; pind < VM_NFREEPOOL; pind++) sbuf_printf(&sbuf, "-- -- "); sbuf_printf(&sbuf, "--\n"); for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { sbuf_printf(&sbuf, " %2d (%6dK)", oind, 1 << (PAGE_SHIFT - 10 + oind)); for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = vm_phys_free_queues[dom][flind][pind]; sbuf_printf(&sbuf, " | %6d", fl[oind].lcnt); } sbuf_printf(&sbuf, "\n"); } } } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Outputs the set of physical memory segments. */ static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; struct vm_phys_seg *seg; int error, segind; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); for (segind = 0; segind < vm_phys_nsegs; segind++) { sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind); seg = &vm_phys_segs[segind]; sbuf_printf(&sbuf, "start: %#jx\n", (uintmax_t)seg->start); sbuf_printf(&sbuf, "end: %#jx\n", (uintmax_t)seg->end); sbuf_printf(&sbuf, "domain: %d\n", seg->domain); sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues); } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Return affinity, or -1 if there's no affinity information. 
*/ int vm_phys_mem_affinity(int f, int t) { #if MAXMEMDOM > 1 if (mem_locality == NULL) return (-1); if (f >= vm_ndomains || t >= vm_ndomains) return (-1); return (mem_locality[f * vm_ndomains + t]); #else return (-1); #endif } #if MAXMEMDOM > 1 /* * Outputs the VM locality table. */ static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; int error, i, j; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_printf(&sbuf, "\n"); for (i = 0; i < vm_ndomains; i++) { sbuf_printf(&sbuf, "%d: ", i); for (j = 0; j < vm_ndomains; j++) { sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j)); } sbuf_printf(&sbuf, "\n"); } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } #endif static void vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail) { m->order = order; if (tail) TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q); else TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q); fl[order].lcnt++; } static void vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order) { TAILQ_REMOVE(&fl[order].pl, m, plinks.q); fl[order].lcnt--; m->order = VM_NFREEORDER; } /* * Create a physical memory segment. 
*/ static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain) { struct vm_phys_seg *seg; KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX, ("vm_phys_create_seg: increase VM_PHYSSEG_MAX")); KASSERT(domain < vm_ndomains, ("vm_phys_create_seg: invalid domain provided")); seg = &vm_phys_segs[vm_phys_nsegs++]; while (seg > vm_phys_segs && (seg - 1)->start >= end) { *seg = *(seg - 1); seg--; } seg->start = start; seg->end = end; seg->domain = domain; } static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end) { int i; if (mem_affinity == NULL) { _vm_phys_create_seg(start, end, 0); return; } for (i = 0;; i++) { if (mem_affinity[i].end == 0) panic("Reached end of affinity info"); if (mem_affinity[i].end <= start) continue; if (mem_affinity[i].start > start) panic("No affinity info for start %jx", (uintmax_t)start); if (mem_affinity[i].end >= end) { _vm_phys_create_seg(start, end, mem_affinity[i].domain); break; } _vm_phys_create_seg(start, mem_affinity[i].end, mem_affinity[i].domain); start = mem_affinity[i].end; } } /* * Add a physical memory segment. */ void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end) { vm_paddr_t paddr; KASSERT((start & PAGE_MASK) == 0, ("vm_phys_define_seg: start is not page aligned")); KASSERT((end & PAGE_MASK) == 0, ("vm_phys_define_seg: end is not page aligned")); /* * Split the physical memory segment if it spans two or more free * list boundaries. 
*/ paddr = start; #ifdef VM_FREELIST_ISADMA if (paddr < VM_ISADMA_BOUNDARY && end > VM_ISADMA_BOUNDARY) { vm_phys_create_seg(paddr, VM_ISADMA_BOUNDARY); paddr = VM_ISADMA_BOUNDARY; } #endif #ifdef VM_FREELIST_LOWMEM if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) { vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY); paddr = VM_LOWMEM_BOUNDARY; } #endif #ifdef VM_FREELIST_DMA32 if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) { vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY); paddr = VM_DMA32_BOUNDARY; } #endif vm_phys_create_seg(paddr, end); } /* * Initialize the physical memory allocator. * * Requires that vm_page_array is initialized! */ void vm_phys_init(void) { struct vm_freelist *fl; struct vm_phys_seg *seg; u_long npages; int dom, flind, freelist, oind, pind, segind; /* * Compute the number of free lists, and generate the mapping from the * manifest constants VM_FREELIST_* to the free list indices. * * Initially, the entries of vm_freelist_to_flind[] are set to either * 0 or 1 to indicate which free lists should be created. */ npages = 0; for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { seg = &vm_phys_segs[segind]; #ifdef VM_FREELIST_ISADMA if (seg->end <= VM_ISADMA_BOUNDARY) vm_freelist_to_flind[VM_FREELIST_ISADMA] = 1; else #endif #ifdef VM_FREELIST_LOWMEM if (seg->end <= VM_LOWMEM_BOUNDARY) vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1; else #endif #ifdef VM_FREELIST_DMA32 if ( #ifdef VM_DMA32_NPAGES_THRESHOLD /* * Create the DMA32 free list only if the amount of * physical memory above physical address 4G exceeds the * given threshold. */ npages > VM_DMA32_NPAGES_THRESHOLD && #endif seg->end <= VM_DMA32_BOUNDARY) vm_freelist_to_flind[VM_FREELIST_DMA32] = 1; else #endif { npages += atop(seg->end - seg->start); vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1; } } /* Change each entry into a running total of the free lists. 
*/ for (freelist = 1; freelist < VM_NFREELIST; freelist++) { vm_freelist_to_flind[freelist] += vm_freelist_to_flind[freelist - 1]; } vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1]; KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists")); /* Change each entry into a free list index. */ for (freelist = 0; freelist < VM_NFREELIST; freelist++) vm_freelist_to_flind[freelist]--; /* * Initialize the first_page and free_queues fields of each physical * memory segment. */ #ifdef VM_PHYSSEG_SPARSE npages = 0; #endif for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; #ifdef VM_PHYSSEG_SPARSE seg->first_page = &vm_page_array[npages]; npages += atop(seg->end - seg->start); #else seg->first_page = PHYS_TO_VM_PAGE(seg->start); #endif #ifdef VM_FREELIST_ISADMA if (seg->end <= VM_ISADMA_BOUNDARY) { flind = vm_freelist_to_flind[VM_FREELIST_ISADMA]; KASSERT(flind >= 0, ("vm_phys_init: ISADMA flind < 0")); } else #endif #ifdef VM_FREELIST_LOWMEM if (seg->end <= VM_LOWMEM_BOUNDARY) { flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM]; KASSERT(flind >= 0, ("vm_phys_init: LOWMEM flind < 0")); } else #endif #ifdef VM_FREELIST_DMA32 if (seg->end <= VM_DMA32_BOUNDARY) { flind = vm_freelist_to_flind[VM_FREELIST_DMA32]; KASSERT(flind >= 0, ("vm_phys_init: DMA32 flind < 0")); } else #endif { flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT]; KASSERT(flind >= 0, ("vm_phys_init: DEFAULT flind < 0")); } seg->free_queues = &vm_phys_free_queues[seg->domain][flind]; } /* * Initialize the free queues. */ for (dom = 0; dom < vm_ndomains; dom++) { for (flind = 0; flind < vm_nfreelists; flind++) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = vm_phys_free_queues[dom][flind][pind]; for (oind = 0; oind < VM_NFREEORDER; oind++) TAILQ_INIT(&fl[oind].pl); } } } rw_init(&vm_phys_fictitious_reg_lock, "vmfctr"); } /* * Split a contiguous, power of two-sized set of physical pages. 
*/ static __inline void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order) { vm_page_t m_buddy; while (oind > order) { oind--; m_buddy = &m[1 << oind]; KASSERT(m_buddy->order == VM_NFREEORDER, ("vm_phys_split_pages: page %p has unexpected order %d", m_buddy, m_buddy->order)); vm_freelist_add(fl, m_buddy, oind, 0); } } /* * Initialize a physical page and add it to the free lists. */ void vm_phys_add_page(vm_paddr_t pa) { vm_page_t m; struct vm_domain *vmd; vm_cnt.v_page_count++; m = vm_phys_paddr_to_vm_page(pa); m->phys_addr = pa; m->queue = PQ_NONE; m->segind = vm_phys_paddr_to_segind(pa); vmd = vm_phys_domain(m); vmd->vmd_page_count++; vmd->vmd_segs |= 1UL << m->segind; KASSERT(m->order == VM_NFREEORDER, ("vm_phys_add_page: page %p has unexpected order %d", m, m->order)); m->pool = VM_FREEPOOL_DEFAULT; pmap_page_init(m); mtx_lock(&vm_page_queue_free_mtx); vm_phys_freecnt_adj(m, 1); vm_phys_free_pages(m, 0); mtx_unlock(&vm_page_queue_free_mtx); } /* * Allocate a contiguous, power of two-sized set of physical pages * from the free lists. * * The free page queues must be locked. */ vm_page_t vm_phys_alloc_pages(int pool, int order) { vm_page_t m; int domain, flind; struct vm_domain_iterator vi; KASSERT(pool < VM_NFREEPOOL, ("vm_phys_alloc_pages: pool %d is out of range", pool)); KASSERT(order < VM_NFREEORDER, ("vm_phys_alloc_pages: order %d is out of range", order)); vm_policy_iterator_init(&vi); while ((vm_domain_iterator_run(&vi, &domain)) == 0) { for (flind = 0; flind < vm_nfreelists; flind++) { m = vm_phys_alloc_domain_pages(domain, flind, pool, order); if (m != NULL) return (m); } } vm_policy_iterator_finish(&vi); return (NULL); } /* * Allocate a contiguous, power of two-sized set of physical pages from the * specified free list. The free list must be specified using one of the * manifest constants VM_FREELIST_*. * * The free page queues must be locked. 
*/ vm_page_t vm_phys_alloc_freelist_pages(int freelist, int pool, int order) { vm_page_t m; struct vm_domain_iterator vi; int domain; KASSERT(freelist < VM_NFREELIST, ("vm_phys_alloc_freelist_pages: freelist %d is out of range", freelist)); KASSERT(pool < VM_NFREEPOOL, ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool)); KASSERT(order < VM_NFREEORDER, ("vm_phys_alloc_freelist_pages: order %d is out of range", order)); vm_policy_iterator_init(&vi); while ((vm_domain_iterator_run(&vi, &domain)) == 0) { m = vm_phys_alloc_domain_pages(domain, vm_freelist_to_flind[freelist], pool, order); if (m != NULL) return (m); } vm_policy_iterator_finish(&vi); return (NULL); } static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool, int order) { struct vm_freelist *fl; struct vm_freelist *alt; int oind, pind; vm_page_t m; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); fl = &vm_phys_free_queues[domain][flind][pool][0]; for (oind = order; oind < VM_NFREEORDER; oind++) { m = TAILQ_FIRST(&fl[oind].pl); if (m != NULL) { vm_freelist_rem(fl, m, oind); vm_phys_split_pages(m, oind, fl, order); return (m); } } /* * The given pool was empty. Find the largest * contiguous, power-of-two-sized set of pages in any * pool. Transfer these pages to the given pool, and * use them to satisfy the allocation. */ for (oind = VM_NFREEORDER - 1; oind >= order; oind--) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { alt = &vm_phys_free_queues[domain][flind][pind][0]; m = TAILQ_FIRST(&alt[oind].pl); if (m != NULL) { vm_freelist_rem(alt, m, oind); vm_phys_set_pool(pool, m, oind); vm_phys_split_pages(m, oind, fl, order); return (m); } } } return (NULL); } /* * Find the vm_page corresponding to the given physical address. 
*/ vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa) { struct vm_phys_seg *seg; int segind; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; if (pa >= seg->start && pa < seg->end) return (&seg->first_page[atop(pa - seg->start)]); } return (NULL); } vm_page_t vm_phys_fictitious_to_vm_page(vm_paddr_t pa) { struct vm_phys_fictitious_seg tmp, *seg; vm_page_t m; m = NULL; tmp.start = pa; tmp.end = 0; rw_rlock(&vm_phys_fictitious_reg_lock); seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); rw_runlock(&vm_phys_fictitious_reg_lock); if (seg == NULL) return (NULL); m = &seg->first_page[atop(pa - seg->start)]; KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m)); return (m); } static inline void vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start, long page_count, vm_memattr_t memattr) { long i; for (i = 0; i < page_count; i++) { vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr); range[i].oflags &= ~VPO_UNMANAGED; range[i].busy_lock = VPB_UNBUSIED; } } int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, vm_memattr_t memattr) { struct vm_phys_fictitious_seg *seg; vm_page_t fp; long page_count; #ifdef VM_PHYSSEG_DENSE long pi, pe; long dpage_count; #endif KASSERT(start < end, ("Start of segment isn't less than end (start: %jx end: %jx)", (uintmax_t)start, (uintmax_t)end)); page_count = (end - start) / PAGE_SIZE; #ifdef VM_PHYSSEG_DENSE pi = atop(start); pe = atop(end); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { fp = &vm_page_array[pi - first_page]; if ((pe - first_page) > vm_page_array_size) { /* * We have a segment that starts inside * of vm_page_array, but ends outside of it. * * Use vm_page_array pages for those that are * inside of the vm_page_array range, and * allocate the remaining ones. 
*/ dpage_count = vm_page_array_size - (pi - first_page); vm_phys_fictitious_init_range(fp, start, dpage_count, memattr); page_count -= dpage_count; start += ptoa(dpage_count); goto alloc; } /* * We can allocate the full range from vm_page_array, * so there's no need to register the range in the tree. */ vm_phys_fictitious_init_range(fp, start, page_count, memattr); return (0); } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { /* * We have a segment that ends inside of vm_page_array, * but starts outside of it. */ fp = &vm_page_array[0]; dpage_count = pe - first_page; vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count, memattr); end -= ptoa(dpage_count); page_count -= dpage_count; goto alloc; } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { /* * Trying to register a fictitious range that expands before * and after vm_page_array. */ return (EINVAL); } else { alloc: #endif fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES, M_WAITOK | M_ZERO); #ifdef VM_PHYSSEG_DENSE } #endif vm_phys_fictitious_init_range(fp, start, page_count, memattr); seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO); seg->start = start; seg->end = end; seg->first_page = fp; rw_wlock(&vm_phys_fictitious_reg_lock); RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg); rw_wunlock(&vm_phys_fictitious_reg_lock); return (0); } void vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end) { struct vm_phys_fictitious_seg *seg, tmp; #ifdef VM_PHYSSEG_DENSE long pi, pe; #endif KASSERT(start < end, ("Start of segment isn't less than end (start: %jx end: %jx)", (uintmax_t)start, (uintmax_t)end)); #ifdef VM_PHYSSEG_DENSE pi = atop(start); pe = atop(end); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { if ((pe - first_page) <= vm_page_array_size) { /* * This segment was allocated using vm_page_array * only, there's nothing to do since those pages * were never added to the tree. 
*/ return; } /* * We have a segment that starts inside * of vm_page_array, but ends outside of it. * * Calculate how many pages were added to the * tree and free them. */ start = ptoa(first_page + vm_page_array_size); } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { /* * We have a segment that ends inside of vm_page_array, * but starts outside of it. */ end = ptoa(first_page); } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { /* Since it's not possible to register such a range, panic. */ panic( "Unregistering not registered fictitious range [%#jx:%#jx]", (uintmax_t)start, (uintmax_t)end); } #endif tmp.start = start; tmp.end = 0; rw_wlock(&vm_phys_fictitious_reg_lock); seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); if (seg->start != start || seg->end != end) { rw_wunlock(&vm_phys_fictitious_reg_lock); panic( "Unregistering not registered fictitious range [%#jx:%#jx]", (uintmax_t)start, (uintmax_t)end); } RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg); rw_wunlock(&vm_phys_fictitious_reg_lock); free(seg->first_page, M_FICT_PAGES); free(seg, M_FICT_PAGES); } /* * Find the segment containing the given physical address. */ static int vm_phys_paddr_to_segind(vm_paddr_t pa) { struct vm_phys_seg *seg; int segind; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; if (pa >= seg->start && pa < seg->end) return (segind); } panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment" , (uintmax_t)pa); } /* * Free a contiguous, power of two-sized set of physical pages. * * The free page queues must be locked. 
*/ void vm_phys_free_pages(vm_page_t m, int order) { struct vm_freelist *fl; struct vm_phys_seg *seg; vm_paddr_t pa; vm_page_t m_buddy; KASSERT(m->order == VM_NFREEORDER, ("vm_phys_free_pages: page %p has unexpected order %d", m, m->order)); KASSERT(m->pool < VM_NFREEPOOL, ("vm_phys_free_pages: page %p has unexpected pool %d", m, m->pool)); KASSERT(order < VM_NFREEORDER, ("vm_phys_free_pages: order %d is out of range", order)); mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); seg = &vm_phys_segs[m->segind]; if (order < VM_NFREEORDER - 1) { pa = VM_PAGE_TO_PHYS(m); do { pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order)); if (pa < seg->start || pa >= seg->end) break; m_buddy = &seg->first_page[atop(pa - seg->start)]; if (m_buddy->order != order) break; fl = (*seg->free_queues)[m_buddy->pool]; vm_freelist_rem(fl, m_buddy, order); if (m_buddy->pool != m->pool) vm_phys_set_pool(m->pool, m_buddy, order); order++; pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1); m = &seg->first_page[atop(pa - seg->start)]; } while (order < VM_NFREEORDER - 1); } fl = (*seg->free_queues)[m->pool]; vm_freelist_add(fl, m, order, 1); } /* * Free a contiguous, arbitrarily sized set of physical pages. * * The free page queues must be locked. */ void vm_phys_free_contig(vm_page_t m, u_long npages) { u_int n; int order; /* * Avoid unnecessary coalescing by freeing the pages in the largest * possible power-of-two-sized subsets. */ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); for (;; npages -= n) { /* * Unsigned "min" is used here so that "order" is assigned * "VM_NFREEORDER - 1" when "m"'s physical address is zero * or the low-order bits of its physical address are zero * because the size of a physical address exceeds the size of * a long. */ order = min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1, VM_NFREEORDER - 1); n = 1 << order; if (npages < n) break; vm_phys_free_pages(m, order); m += n; } /* The residual "npages" is less than "1 << (VM_NFREEORDER - 1)". 
*/ for (; npages > 0; npages -= n) { order = flsl(npages) - 1; n = 1 << order; vm_phys_free_pages(m, order); m += n; } } /* + * Scan physical memory between the specified addresses "low" and "high" for a + * run of contiguous physical pages that satisfy the specified conditions, and + * return the lowest page in the run. The specified "alignment" determines + * the alignment of the lowest physical page in the run. If the specified + * "boundary" is non-zero, then the run of physical pages cannot span a + * physical address that is a multiple of "boundary". + * + * "npages" must be greater than zero. Both "alignment" and "boundary" must + * be a power of two. + */ +vm_page_t +vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, + u_long alignment, vm_paddr_t boundary, int options) +{ + vm_paddr_t pa_end; + vm_page_t m_end, m_run, m_start; + struct vm_phys_seg *seg; + int segind; + + KASSERT(npages > 0, ("npages is 0")); + KASSERT(powerof2(alignment), ("alignment is not a power of 2")); + KASSERT(powerof2(boundary), ("boundary is not a power of 2")); + if (low >= high) + return (NULL); + for (segind = 0; segind < vm_phys_nsegs; segind++) { + seg = &vm_phys_segs[segind]; + if (seg->start >= high) + break; + if (low >= seg->end) + continue; + if (low <= seg->start) + m_start = seg->first_page; + else + m_start = &seg->first_page[atop(low - seg->start)]; + if (high < seg->end) + pa_end = high; + else + pa_end = seg->end; + if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages)) + continue; + m_end = &seg->first_page[atop(pa_end - seg->start)]; + m_run = vm_page_scan_contig(npages, m_start, m_end, + alignment, boundary, options); + if (m_run != NULL) + return (m_run); + } + return (NULL); +} + +/* * Set the pool for a contiguous, power of two-sized set of physical pages. 
*/ void vm_phys_set_pool(int pool, vm_page_t m, int order) { vm_page_t m_tmp; for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++) m_tmp->pool = pool; } /* * Search for the given physical page "m" in the free lists. If the search * succeeds, remove "m" from the free lists and return TRUE. Otherwise, return * FALSE, indicating that "m" is not in the free lists. * * The free page queues must be locked. */ boolean_t vm_phys_unfree_page(vm_page_t m) { struct vm_freelist *fl; struct vm_phys_seg *seg; vm_paddr_t pa, pa_half; vm_page_t m_set, m_tmp; int order; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); /* * First, find the contiguous, power of two-sized set of free * physical pages containing the given physical page "m" and * assign it to "m_set". */ seg = &vm_phys_segs[m->segind]; for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && order < VM_NFREEORDER - 1; ) { order++; pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order)); if (pa >= seg->start) m_set = &seg->first_page[atop(pa - seg->start)]; else return (FALSE); } if (m_set->order < order) return (FALSE); if (m_set->order == VM_NFREEORDER) return (FALSE); KASSERT(m_set->order < VM_NFREEORDER, ("vm_phys_unfree_page: page %p has unexpected order %d", m_set, m_set->order)); /* * Next, remove "m_set" from the free lists. Finally, extract * "m" from "m_set" using an iterative algorithm: While "m_set" * is larger than a page, shrink "m_set" by returning the half * of "m_set" that does not contain "m" to the free lists. 
*/ fl = (*seg->free_queues)[m_set->pool]; order = m_set->order; vm_freelist_rem(fl, m_set, order); while (order > 0) { order--; pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order)); if (m->phys_addr < pa_half) m_tmp = &seg->first_page[atop(pa_half - seg->start)]; else { m_tmp = m_set; m_set = &seg->first_page[atop(pa_half - seg->start)]; } vm_freelist_add(fl, m_tmp, order, 0); } KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency")); return (TRUE); } /* * Try to zero one physical page. Used by an idle priority thread. */ boolean_t vm_phys_zero_pages_idle(void) { static struct vm_freelist *fl; static int flind, oind, pind; vm_page_t m, m_tmp; int domain; domain = vm_rr_selectdomain(); fl = vm_phys_free_queues[domain][0][0]; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); for (;;) { TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, plinks.q) { for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) { if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) { vm_phys_unfree_page(m_tmp); vm_phys_freecnt_adj(m, -1); mtx_unlock(&vm_page_queue_free_mtx); pmap_zero_page_idle(m_tmp); m_tmp->flags |= PG_ZERO; mtx_lock(&vm_page_queue_free_mtx); vm_phys_freecnt_adj(m, 1); vm_phys_free_pages(m_tmp, 0); vm_page_zero_count++; cnt_prezero++; return (TRUE); } } } oind++; if (oind == VM_NFREEORDER) { oind = 0; pind++; if (pind == VM_NFREEPOOL) { pind = 0; flind++; if (flind == vm_nfreelists) flind = 0; } fl = vm_phys_free_queues[domain][flind][pind]; } } } /* * Allocate a contiguous set of physical pages of the given size * "npages" from the free lists. All of the physical pages must be at * or above the given physical address "low" and below the given * physical address "high". The given value "alignment" determines the * alignment of the first physical page in the set. If the given value * "boundary" is non-zero, then the set of physical pages cannot cross * any physical address boundary that is a multiple of that value. Both * "alignment" and "boundary" must be a power of two. 
*/ vm_page_t vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { - struct vm_freelist *fl; - struct vm_phys_seg *seg; - vm_paddr_t pa, pa_last, size; - vm_page_t m, m_ret; - u_long npages_end; - int domain, flind, oind, order, pind; + vm_paddr_t pa_end, pa_start; + vm_page_t m_run; struct vm_domain_iterator vi; + struct vm_phys_seg *seg; + int domain, segind; + KASSERT(npages > 0, ("npages is 0")); + KASSERT(powerof2(alignment), ("alignment is not a power of 2")); + KASSERT(powerof2(boundary), ("boundary is not a power of 2")); mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - size = npages << PAGE_SHIFT; - KASSERT(size != 0, - ("vm_phys_alloc_contig: size must not be 0")); - KASSERT((alignment & (alignment - 1)) == 0, - ("vm_phys_alloc_contig: alignment must be a power of 2")); - KASSERT((boundary & (boundary - 1)) == 0, - ("vm_phys_alloc_contig: boundary must be a power of 2")); - /* Compute the queue that is the best fit for npages. 
*/ - for (order = 0; (1 << order) < npages; order++); - + if (low >= high) + return (NULL); vm_policy_iterator_init(&vi); - restartdom: if (vm_domain_iterator_run(&vi, &domain) != 0) { vm_policy_iterator_finish(&vi); return (NULL); } + m_run = NULL; + for (segind = 0; segind < vm_phys_nsegs; segind++) { + seg = &vm_phys_segs[segind]; + if (seg->start >= high) + break; + if (low >= seg->end || seg->domain != domain) + continue; + if (low <= seg->start) + pa_start = seg->start; + else + pa_start = low; + if (high < seg->end) + pa_end = high; + else + pa_end = seg->end; + if (pa_end - pa_start < ptoa(npages)) + continue; + m_run = vm_phys_alloc_seg_contig(seg, npages, low, high, + alignment, boundary); + if (m_run != NULL) + break; + } + if (m_run == NULL && !vm_domain_iterator_isdone(&vi)) + goto restartdom; + vm_policy_iterator_finish(&vi); + return (m_run); +} - for (flind = 0; flind < vm_nfreelists; flind++) { - for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) { - for (pind = 0; pind < VM_NFREEPOOL; pind++) { - fl = &vm_phys_free_queues[domain][flind][pind][0]; - TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) { - /* - * A free list may contain physical pages - * from one or more segments. - */ - seg = &vm_phys_segs[m_ret->segind]; - if (seg->start > high || - low >= seg->end) - continue; +/* + * Allocate a run of contiguous physical pages from the free list for the + * specified segment. 
+ */ +static vm_page_t +vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages, + vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) +{ + struct vm_freelist *fl; + vm_paddr_t pa, pa_end, size; + vm_page_t m, m_ret; + u_long npages_end; + int oind, order, pind; + KASSERT(npages > 0, ("npages is 0")); + KASSERT(powerof2(alignment), ("alignment is not a power of 2")); + KASSERT(powerof2(boundary), ("boundary is not a power of 2")); + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + /* Compute the queue that is the best fit for npages. */ + for (order = 0; (1 << order) < npages; order++); + /* Search for a run satisfying the specified conditions. */ + size = npages << PAGE_SHIFT; + for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; + oind++) { + for (pind = 0; pind < VM_NFREEPOOL; pind++) { + fl = (*seg->free_queues)[pind]; + TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) { + /* + * Is the size of this allocation request + * larger than the largest block size? + */ + if (order >= VM_NFREEORDER) { /* - * Is the size of this allocation request - * larger than the largest block size? + * Determine if a sufficient number of + * subsequent blocks to satisfy the + * allocation request are free. */ - if (order >= VM_NFREEORDER) { - /* - * Determine if a sufficient number - * of subsequent blocks to satisfy - * the allocation request are free. - */ - pa = VM_PAGE_TO_PHYS(m_ret); - pa_last = pa + size; - for (;;) { - pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1); - if (pa >= pa_last) - break; - if (pa < seg->start || - pa >= seg->end) - break; - m = &seg->first_page[atop(pa - seg->start)]; - if (m->order != VM_NFREEORDER - 1) - break; - } - /* If not, continue to the next block. */ - if (pa < pa_last) - continue; - } - - /* - * Determine if the blocks are within the given range, - * satisfy the given alignment, and do not cross the - * given boundary. 
- */ pa = VM_PAGE_TO_PHYS(m_ret); - if (pa >= low && - pa + size <= high && - (pa & (alignment - 1)) == 0 && - ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0) - goto done; + pa_end = pa + size; + for (;;) { + pa += 1 << (PAGE_SHIFT + + VM_NFREEORDER - 1); + if (pa >= pa_end || + pa < seg->start || + pa >= seg->end) + break; + m = &seg->first_page[atop(pa - + seg->start)]; + if (m->order != VM_NFREEORDER - + 1) + break; + } + /* If not, go to the next block. */ + if (pa < pa_end) + continue; } + + /* + * Determine if the blocks are within the + * given range, satisfy the given alignment, + * and do not cross the given boundary. + */ + pa = VM_PAGE_TO_PHYS(m_ret); + pa_end = pa + size; + if (pa >= low && pa_end <= high && (pa & + (alignment - 1)) == 0 && ((pa ^ (pa_end - + 1)) & ~(boundary - 1)) == 0) + goto done; } } } - if (!vm_domain_iterator_isdone(&vi)) - goto restartdom; - vm_policy_iterator_finish(&vi); return (NULL); done: for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) { fl = (*seg->free_queues)[m->pool]; vm_freelist_rem(fl, m, m->order); } if (m_ret->pool != VM_FREEPOOL_DEFAULT) vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind); fl = (*seg->free_queues)[m_ret->pool]; vm_phys_split_pages(m_ret, oind, fl, order); /* Return excess pages to the free lists. */ npages_end = roundup2(npages, 1 << imin(oind, order)); if (npages < npages_end) vm_phys_free_contig(&m_ret[npages], npages_end - npages); return (m_ret); } #ifdef DDB /* * Show the number of physical pages in each of the free lists. 
*/ DB_SHOW_COMMAND(freepages, db_show_freepages) { struct vm_freelist *fl; int flind, oind, pind, dom; for (dom = 0; dom < vm_ndomains; dom++) { db_printf("DOMAIN: %d\n", dom); for (flind = 0; flind < vm_nfreelists; flind++) { db_printf("FREE LIST %d:\n" "\n ORDER (SIZE) | NUMBER" "\n ", flind); for (pind = 0; pind < VM_NFREEPOOL; pind++) db_printf(" | POOL %d", pind); db_printf("\n-- "); for (pind = 0; pind < VM_NFREEPOOL; pind++) db_printf("-- -- "); db_printf("--\n"); for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { db_printf(" %2.2d (%6.6dK)", oind, 1 << (PAGE_SHIFT - 10 + oind)); for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = vm_phys_free_queues[dom][flind][pind]; db_printf(" | %6.6d", fl[oind].lcnt); } db_printf("\n"); } db_printf("\n"); } db_printf("\n"); } } #endif Index: head/sys/vm/vm_phys.h =================================================================== --- head/sys/vm/vm_phys.h (revision 292468) +++ head/sys/vm/vm_phys.h (revision 292469) @@ -1,124 +1,126 @@ /*- * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* * Physical memory system definitions */ #ifndef _VM_PHYS_H_ #define _VM_PHYS_H_ #ifdef _KERNEL /* Domains must be dense (non-sparse) and zero-based. */ struct mem_affinity { vm_paddr_t start; vm_paddr_t end; int domain; }; struct vm_freelist { struct pglist pl; int lcnt; }; struct vm_phys_seg { vm_paddr_t start; vm_paddr_t end; vm_page_t first_page; int domain; struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER]; }; extern struct mem_affinity *mem_affinity; extern int *mem_locality; extern int vm_ndomains; extern struct vm_phys_seg vm_phys_segs[]; extern int vm_phys_nsegs; /* * The following functions are only to be used by the virtual memory system. 
*/ void vm_phys_add_page(vm_paddr_t pa); void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end); vm_page_t vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); vm_page_t vm_phys_alloc_freelist_pages(int freelist, int pool, int order); vm_page_t vm_phys_alloc_pages(int pool, int order); boolean_t vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high); int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, vm_memattr_t memattr); void vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end); vm_page_t vm_phys_fictitious_to_vm_page(vm_paddr_t pa); void vm_phys_free_contig(vm_page_t m, u_long npages); void vm_phys_free_pages(vm_page_t m, int order); void vm_phys_init(void); vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa); +vm_page_t vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, + u_long alignment, vm_paddr_t boundary, int options); void vm_phys_set_pool(int pool, vm_page_t m, int order); boolean_t vm_phys_unfree_page(vm_page_t m); boolean_t vm_phys_zero_pages_idle(void); int vm_phys_mem_affinity(int f, int t); /* * vm_phys_domain: * * Return the memory domain the page belongs to. 
*/ static inline struct vm_domain * vm_phys_domain(vm_page_t m) { #if MAXMEMDOM > 1 int domn, segind; /* XXXKIB try to assert that the page is managed */ segind = m->segind; KASSERT(segind < vm_phys_nsegs, ("segind %d m %p", segind, m)); domn = vm_phys_segs[segind].domain; KASSERT(domn < vm_ndomains, ("domain %d m %p", domn, m)); return (&vm_dom[domn]); #else return (&vm_dom[0]); #endif } static inline void vm_phys_freecnt_adj(vm_page_t m, int adj) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); vm_cnt.v_free_count += adj; vm_phys_domain(m)->vmd_free_count += adj; } #endif /* _KERNEL */ #endif /* !_VM_PHYS_H_ */ Index: head/sys/vm/vm_reserv.c =================================================================== --- head/sys/vm/vm_reserv.c (revision 292468) +++ head/sys/vm/vm_reserv.c (revision 292469) @@ -1,1112 +1,1158 @@ /*- * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007-2011 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Superpage reservation management module * * Any external functions defined by this module are only to be used by the * virtual memory system. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The reservation system supports the speculative allocation of large physical * pages ("superpages"). Speculative allocation enables the fully-automatic * utilization of superpages by the virtual memory system. In other words, no * programmatic directives are required to use superpages. */ #if VM_NRESERVLEVEL > 0 /* * The number of small pages that are contained in a level 0 reservation */ #define VM_LEVEL_0_NPAGES (1 << VM_LEVEL_0_ORDER) /* * The number of bits by which a physical address is shifted to obtain the * reservation number */ #define VM_LEVEL_0_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT) /* * The size of a level 0 reservation in bytes */ #define VM_LEVEL_0_SIZE (1 << VM_LEVEL_0_SHIFT) /* * Computes the index of the small page underlying the given (object, pindex) * within the reservation's array of small pages. 
*/ #define VM_RESERV_INDEX(object, pindex) \ (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1)) /* * The size of a population map entry */ typedef u_long popmap_t; /* * The number of bits in a population map entry */ #define NBPOPMAP (NBBY * sizeof(popmap_t)) /* * The number of population map entries in a reservation */ #define NPOPMAP howmany(VM_LEVEL_0_NPAGES, NBPOPMAP) /* * Clear a bit in the population map. */ static __inline void popmap_clear(popmap_t popmap[], int i) { popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP)); } /* * Set a bit in the population map. */ static __inline void popmap_set(popmap_t popmap[], int i) { popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP); } /* * Is a bit in the population map clear? */ static __inline boolean_t popmap_is_clear(popmap_t popmap[], int i) { return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) == 0); } /* * Is a bit in the population map set? */ static __inline boolean_t popmap_is_set(popmap_t popmap[], int i) { return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0); } /* * The reservation structure * * A reservation structure is constructed whenever a large physical page is * speculatively allocated to an object. The reservation provides the small * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets * within that object. The reservation's "popcnt" tracks the number of these * small physical pages that are in use at any given time. When and if the * reservation is not fully utilized, it appears in the queue of partially- * populated reservations. The reservation always appears on the containing * object's list of reservations. * * A partially-populated reservation can be broken and reclaimed at any time. 
*/
struct vm_reserv {
	/* linkage on vm_rvq_partpop while "inpartpopq" is TRUE */
	TAILQ_ENTRY(vm_reserv) partpopq;
	/* linkage on the containing object's "rvq" list */
	LIST_ENTRY(vm_reserv) objq;
	vm_object_t	object;			/* containing object */
	vm_pindex_t	pindex;			/* offset within object */
	vm_page_t	pages;			/* first page of a superpage */
	int		popcnt;			/* # of pages in use */
	/* TRUE exactly while this reservation is linked on vm_rvq_partpop */
	char		inpartpopq;
	popmap_t	popmap[NPOPMAP];	/* bit vector of used pages */
};

/*
 * The reservation array
 *
 * This array is analogous in function to vm_page_array.  It differs in the
 * respect that it may contain a greater number of useful reservation
 * structures than there are (physical) superpages.  These "invalid"
 * reservation structures exist to trade-off space for time in the
 * implementation of vm_reserv_from_page().  Invalid reservation structures are
 * distinguishable from "valid" reservation structures by inspecting the
 * reservation's "pages" field.  Invalid reservation structures have a NULL
 * "pages" field.
 *
 * vm_reserv_from_page() maps a small (physical) page to an element of this
 * array by computing a physical reservation number from the page's physical
 * address.  The physical reservation number is used as the array index.
 *
 * An "active" reservation is a valid reservation structure that has a non-NULL
 * "object" field and a non-zero "popcnt" field.  In other words, every active
 * reservation belongs to a particular object.  Moreover, every active
 * reservation has an entry in the containing object's list of reservations.
 */
static vm_reserv_t vm_reserv_array;

/*
 * The partially-populated reservation queue
 *
 * This queue enables the fast recovery of an unused cached or free small page
 * from a partially-populated reservation.  The reservation at the head of
 * this queue is the least-recently-changed, partially-populated reservation.
 *
 * Access to this queue is synchronized by the free page queue lock.
*/
static TAILQ_HEAD(, vm_reserv) vm_rvq_partpop =
			    TAILQ_HEAD_INITIALIZER(vm_rvq_partpop);

/* Parent sysctl node; every node registered under it here is CTLFLAG_RD. */
static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD, 0, "Reservation Info");

/* Incremented by vm_reserv_break() each time it breaks a reservation. */
static long vm_reserv_broken;
SYSCTL_LONG(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD,
    &vm_reserv_broken, 0, "Cumulative number of broken reservations");

/*
 * Incremented by vm_reserv_depopulate() when a reservation's population count
 * drops to zero and its pages are handed back to the physical allocator.
 */
static long vm_reserv_freed;
SYSCTL_LONG(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
    &vm_reserv_freed, 0, "Cumulative number of freed reservations");

static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS);

SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
    sysctl_vm_reserv_fullpop, "I", "Current number of full reservations");

static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);

SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
    sysctl_vm_reserv_partpopq, "A", "Partially-populated reservation queues");

/* Incremented by vm_reserv_reclaim() each time it reclaims a reservation. */
static long vm_reserv_reclaimed;
SYSCTL_LONG(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
    &vm_reserv_reclaimed, 0, "Cumulative number of reclaimed reservations");

/* Forward declarations of the file-local helpers defined below. */
static void		vm_reserv_break(vm_reserv_t rv, vm_page_t m);
static void		vm_reserv_depopulate(vm_reserv_t rv, int index);
static vm_reserv_t	vm_reserv_from_page(vm_page_t m);
static boolean_t	vm_reserv_has_pindex(vm_reserv_t rv,
			    vm_pindex_t pindex);
static void		vm_reserv_populate(vm_reserv_t rv, int index);
static void		vm_reserv_reclaim(vm_reserv_t rv);

/*
 * Returns the current number of full reservations.
 *
 * Since the number of full reservations is computed without acquiring the
 * free page queue lock, the returned value may be inexact.
*/ static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS) { vm_paddr_t paddr; struct vm_phys_seg *seg; vm_reserv_t rv; int fullpop, segind; fullpop = 0; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; paddr = roundup2(seg->start, VM_LEVEL_0_SIZE); while (paddr + VM_LEVEL_0_SIZE <= seg->end) { rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT]; fullpop += rv->popcnt == VM_LEVEL_0_NPAGES; paddr += VM_LEVEL_0_SIZE; } } return (sysctl_handle_int(oidp, &fullpop, 0, req)); } /* * Describes the current state of the partially-populated reservation queue. */ static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; vm_reserv_t rv; int counter, error, level, unused_pages; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_printf(&sbuf, "\nLEVEL SIZE NUMBER\n\n"); for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) { counter = 0; unused_pages = 0; mtx_lock(&vm_page_queue_free_mtx); TAILQ_FOREACH(rv, &vm_rvq_partpop/*[level]*/, partpopq) { counter++; unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt; } mtx_unlock(&vm_page_queue_free_mtx); sbuf_printf(&sbuf, "%5d: %6dK, %6d\n", level, unused_pages * ((int)PAGE_SIZE / 1024), counter); } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Reduces the given reservation's population count. If the population count * becomes zero, the reservation is destroyed. Additionally, moves the * reservation to the tail of the partially-populated reservation queue if the * population count is non-zero. * * The free page queue lock must be held. 
*/ static void vm_reserv_depopulate(vm_reserv_t rv, int index) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT(rv->object != NULL, ("vm_reserv_depopulate: reserv %p is free", rv)); KASSERT(popmap_is_set(rv->popmap, index), ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv, index)); KASSERT(rv->popcnt > 0, ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv)); if (rv->inpartpopq) { TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); rv->inpartpopq = FALSE; } else { KASSERT(rv->pages->psind == 1, ("vm_reserv_depopulate: reserv %p is already demoted", rv)); rv->pages->psind = 0; } popmap_clear(rv->popmap, index); rv->popcnt--; if (rv->popcnt == 0) { LIST_REMOVE(rv, objq); rv->object = NULL; vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER); vm_reserv_freed++; } else { rv->inpartpopq = TRUE; TAILQ_INSERT_TAIL(&vm_rvq_partpop, rv, partpopq); } } /* * Returns the reservation to which the given page might belong. */ static __inline vm_reserv_t vm_reserv_from_page(vm_page_t m) { return (&vm_reserv_array[VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT]); } /* * Returns TRUE if the given reservation contains the given page index and * FALSE otherwise. */ static __inline boolean_t vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex) { return (((pindex - rv->pindex) & ~(VM_LEVEL_0_NPAGES - 1)) == 0); } /* * Increases the given reservation's population count. Moves the reservation * to the tail of the partially-populated reservation queue. * * The free page queue must be locked. 
*/
static void
vm_reserv_populate(vm_reserv_t rv, int index)
{

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	KASSERT(rv->object != NULL,
	    ("vm_reserv_populate: reserv %p is free", rv));
	KASSERT(popmap_is_clear(rv->popmap, index),
	    ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv, index));
	KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
	    ("vm_reserv_populate: reserv %p is already full", rv));
	KASSERT(rv->pages->psind == 0,
	    ("vm_reserv_populate: reserv %p is already promoted", rv));
	/* Unlink first so that a requeue below lands at the tail. */
	if (rv->inpartpopq) {
		TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
		rv->inpartpopq = FALSE;
	}
	popmap_set(rv->popmap, index);
	rv->popcnt++;
	if (rv->popcnt < VM_LEVEL_0_NPAGES) {
		rv->inpartpopq = TRUE;
		TAILQ_INSERT_TAIL(&vm_rvq_partpop, rv, partpopq);
	} else
		/* Fully populated: stays off the queue, first page is marked. */
		rv->pages->psind = 1;
}

/*
 * Allocates a contiguous set of physical pages of the given size "npages"
 * from existing or newly created reservations.  All of the physical pages
 * must be at or above the given physical address "low" and below the given
 * physical address "high".  The given value "alignment" determines the
 * alignment of the first physical page in the set.  If the given value
 * "boundary" is non-zero, then the set of physical pages cannot cross any
 * physical address boundary that is a multiple of that value.  Both
 * "alignment" and "boundary" must be a power of two.
 *
 * Returns the first page of the set, or NULL if no reservation can satisfy
 * the request.
 *
 * The object and free page queue must be locked.
 */
vm_page_t
vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages,
    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
{
	vm_paddr_t pa, size;
	vm_page_t m, m_ret, mpred, msucc;
	vm_pindex_t first, leftcap, rightcap;
	vm_reserv_t rv;
	u_long allocpages, maxpages, minpages;
	int i, index, n;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0"));

	/*
	 * Is a reservation fundamentally impossible?
	 */
	if (pindex < VM_RESERV_INDEX(object, pindex) ||
	    pindex + npages > object->size)
		return (NULL);

	/*
	 * All reservations of a particular size have the same alignment.
	 * Assuming that the first page is allocated from a reservation, the
	 * least significant bits of its physical address can be determined
	 * from its offset from the beginning of the reservation and the size
	 * of the reservation.
	 *
	 * Could the specified index within a reservation of the smallest
	 * possible size satisfy the alignment and boundary requirements?
	 */
	pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
	if ((pa & (alignment - 1)) != 0)
		return (NULL);
	size = npages << PAGE_SHIFT;
	if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
		return (NULL);

	/*
	 * Look for an existing reservation.  Both the nearest resident page
	 * at or below "pindex" and its successor are examined.
	 */
	mpred = vm_radix_lookup_le(&object->rtree, pindex);
	if (mpred != NULL) {
		KASSERT(mpred->pindex < pindex,
		    ("vm_reserv_alloc_contig: pindex already allocated"));
		rv = vm_reserv_from_page(mpred);
		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
			goto found;
		msucc = TAILQ_NEXT(mpred, listq);
	} else
		msucc = TAILQ_FIRST(&object->memq);
	if (msucc != NULL) {
		KASSERT(msucc->pindex > pindex,
		    ("vm_reserv_alloc_contig: pindex already allocated"));
		rv = vm_reserv_from_page(msucc);
		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
			goto found;
	}

	/*
	 * Could at least one reservation fit between the first index to the
	 * left that can be used ("leftcap") and the first index to the right
	 * that cannot be used ("rightcap")?
	 */
	first = pindex - VM_RESERV_INDEX(object, pindex);
	if (mpred != NULL) {
		if ((rv = vm_reserv_from_page(mpred))->object != object)
			leftcap = mpred->pindex + 1;
		else
			leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
		if (leftcap > first)
			return (NULL);
	}
	/*
	 * "minpages" reaches exactly to the last requested page, whereas
	 * "maxpages" rounds that up to a whole number of reservations.
	 */
	minpages = VM_RESERV_INDEX(object, pindex) + npages;
	maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES);
	allocpages = maxpages;
	if (msucc != NULL) {
		if ((rv = vm_reserv_from_page(msucc))->object != object)
			rightcap = msucc->pindex;
		else
			rightcap = rv->pindex;
		if (first + maxpages > rightcap) {
			if (maxpages == VM_LEVEL_0_NPAGES)
				return (NULL);

			/*
			 * At least one reservation will fit between "leftcap"
			 * and "rightcap".  However, a reservation for the
			 * last of the requested pages will not fit.  Reduce
			 * the size of the upcoming allocation accordingly.
			 */
			allocpages = minpages;
		}
	}

	/*
	 * Would the last new reservation extend past the end of the object?
	 */
	if (first + maxpages > object->size) {
		/*
		 * Don't allocate the last new reservation if the object is a
		 * vnode or backed by another object that is a vnode.
		 */
		if (object->type == OBJT_VNODE ||
		    (object->backing_object != NULL &&
		    object->backing_object->type == OBJT_VNODE)) {
			if (maxpages == VM_LEVEL_0_NPAGES)
				return (NULL);
			allocpages = minpages;
		}
		/* Speculate that the object may grow. */
	}

	/*
	 * Allocate the physical pages.  The alignment and boundary specified
	 * for this allocation may be different from the alignment and
	 * boundary specified for the requested pages.  For instance, the
	 * specified index may not be the first page within the first new
	 * reservation.
	 */
	m = vm_phys_alloc_contig(allocpages, low, high, ulmax(alignment,
	    VM_LEVEL_0_SIZE), boundary > VM_LEVEL_0_SIZE ? boundary : 0);
	if (m == NULL)
		return (NULL);

	/*
	 * The allocated physical pages always begin at a reservation
	 * boundary, but they do not always end at a reservation boundary.
	 * Initialize every reservation that is completely covered by the
	 * allocated physical pages.
	 */
	m_ret = NULL;
	index = VM_RESERV_INDEX(object, pindex);
	do {
		rv = vm_reserv_from_page(m);
		KASSERT(rv->pages == m,
		    ("vm_reserv_alloc_contig: reserv %p's pages is corrupted", rv));
		KASSERT(rv->object == NULL,
		    ("vm_reserv_alloc_contig: reserv %p isn't free", rv));
		LIST_INSERT_HEAD(&object->rvq, rv, objq);
		rv->object = object;
		rv->pindex = first;
		KASSERT(rv->popcnt == 0,
		    ("vm_reserv_alloc_contig: reserv %p's popcnt is corrupted", rv));
		KASSERT(!rv->inpartpopq,
		    ("vm_reserv_alloc_contig: reserv %p's inpartpopq is TRUE", rv));
		for (i = 0; i < NPOPMAP; i++)
			KASSERT(rv->popmap[i] == 0,
			    ("vm_reserv_alloc_contig: reserv %p's popmap is corrupted", rv));
		/* Populate only the requested pages that fall in this one. */
		n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
		for (i = 0; i < n; i++)
			vm_reserv_populate(rv, index + i);
		npages -= n;
		if (m_ret == NULL) {
			/*
			 * Only the first reservation starts at a non-zero
			 * index; every later one is populated from index 0.
			 */
			m_ret = &rv->pages[index];
			index = 0;
		}
		m += VM_LEVEL_0_NPAGES;
		first += VM_LEVEL_0_NPAGES;
		allocpages -= VM_LEVEL_0_NPAGES;
	} while (allocpages >= VM_LEVEL_0_NPAGES);
	return (m_ret);

	/*
	 * Found a matching reservation.
	 */
found:
	index = VM_RESERV_INDEX(object, pindex);
	/* Does the allocation fit within the reservation? */
	if (index + npages > VM_LEVEL_0_NPAGES)
		return (NULL);
	m = &rv->pages[index];
	pa = VM_PAGE_TO_PHYS(m);
	if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 ||
	    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
		return (NULL);
	/* Handle vm_page_rename(m, new_object, ...). */
	for (i = 0; i < npages; i++)
		if (popmap_is_set(rv->popmap, index + i))
			return (NULL);
	/* Every requested page was verified clear above; claim them all. */
	for (i = 0; i < npages; i++)
		vm_reserv_populate(rv, index + i);
	return (m);
}

/*
 * Allocates a page from an existing or newly-created reservation.
 *
 * The page "mpred" must immediately precede the offset "pindex" within the
 * specified object.
 *
 * The object and free page queue must be locked.
*/
vm_page_t
vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, vm_page_t mpred)
{
	vm_page_t m, msucc;
	vm_pindex_t first, leftcap, rightcap;
	vm_reserv_t rv;
	int i, index;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	VM_OBJECT_ASSERT_WLOCKED(object);

	/*
	 * Is a reservation fundamentally impossible?
	 */
	if (pindex < VM_RESERV_INDEX(object, pindex) ||
	    pindex >= object->size)
		return (NULL);

	/*
	 * Look for an existing reservation.  Both the predecessor page and
	 * its successor on the object's page list are examined.
	 */
	if (mpred != NULL) {
		KASSERT(mpred->object == object,
		    ("vm_reserv_alloc_page: object doesn't contain mpred"));
		KASSERT(mpred->pindex < pindex,
		    ("vm_reserv_alloc_page: mpred doesn't precede pindex"));
		rv = vm_reserv_from_page(mpred);
		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
			goto found;
		msucc = TAILQ_NEXT(mpred, listq);
	} else
		msucc = TAILQ_FIRST(&object->memq);
	if (msucc != NULL) {
		KASSERT(msucc->pindex > pindex,
		    ("vm_reserv_alloc_page: msucc doesn't succeed pindex"));
		rv = vm_reserv_from_page(msucc);
		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
			goto found;
	}

	/*
	 * Could a reservation fit between the first index to the left that
	 * can be used and the first index to the right that cannot be used?
	 */
	first = pindex - VM_RESERV_INDEX(object, pindex);
	if (mpred != NULL) {
		if ((rv = vm_reserv_from_page(mpred))->object != object)
			leftcap = mpred->pindex + 1;
		else
			leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
		if (leftcap > first)
			return (NULL);
	}
	if (msucc != NULL) {
		if ((rv = vm_reserv_from_page(msucc))->object != object)
			rightcap = msucc->pindex;
		else
			rightcap = rv->pindex;
		if (first + VM_LEVEL_0_NPAGES > rightcap)
			return (NULL);
	}

	/*
	 * Would a new reservation extend past the end of the object?
	 */
	if (first + VM_LEVEL_0_NPAGES > object->size) {
		/*
		 * Don't allocate a new reservation if the object is a vnode or
		 * backed by another object that is a vnode.
		 */
		if (object->type == OBJT_VNODE ||
		    (object->backing_object != NULL &&
		    object->backing_object->type == OBJT_VNODE))
			return (NULL);
		/* Speculate that the object may grow. */
	}

	/*
	 * Allocate and populate the new reservation.
	 */
	m = vm_phys_alloc_pages(VM_FREEPOOL_DEFAULT, VM_LEVEL_0_ORDER);
	if (m == NULL)
		return (NULL);
	rv = vm_reserv_from_page(m);
	KASSERT(rv->pages == m,
	    ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv));
	KASSERT(rv->object == NULL,
	    ("vm_reserv_alloc_page: reserv %p isn't free", rv));
	LIST_INSERT_HEAD(&object->rvq, rv, objq);
	rv->object = object;
	rv->pindex = first;
	KASSERT(rv->popcnt == 0,
	    ("vm_reserv_alloc_page: reserv %p's popcnt is corrupted", rv));
	KASSERT(!rv->inpartpopq,
	    ("vm_reserv_alloc_page: reserv %p's inpartpopq is TRUE", rv));
	for (i = 0; i < NPOPMAP; i++)
		KASSERT(rv->popmap[i] == 0,
		    ("vm_reserv_alloc_page: reserv %p's popmap is corrupted", rv));
	/* Only the requested page is marked in use; the rest stay reserved. */
	index = VM_RESERV_INDEX(object, pindex);
	vm_reserv_populate(rv, index);
	return (&rv->pages[index]);

	/*
	 * Found a matching reservation.
	 */
found:
	index = VM_RESERV_INDEX(object, pindex);
	m = &rv->pages[index];
	/* Handle vm_page_rename(m, new_object, ...). */
	if (popmap_is_set(rv->popmap, index))
		return (NULL);
	vm_reserv_populate(rv, index);
	return (m);
}

/*
 * Breaks the given reservation.  Except for the specified cached or free
 * page, all cached and free pages in the reservation are returned to the
 * physical memory allocator.  The reservation's population count and map are
 * reset to their initial state.
 *
 * The given reservation must not be in the partially-populated reservation
 * queue.  The free page queue lock must be held.
*/
static void
vm_reserv_break(vm_reserv_t rv, vm_page_t m)
{
	int begin_zeroes, hi, i, lo;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	KASSERT(rv->object != NULL,
	    ("vm_reserv_break: reserv %p is free", rv));
	KASSERT(!rv->inpartpopq,
	    ("vm_reserv_break: reserv %p's inpartpopq is TRUE", rv));
	LIST_REMOVE(rv, objq);
	rv->object = NULL;
	if (m != NULL) {
		/*
		 * Since the reservation is being broken, there is no harm in
		 * abusing the population map to stop "m" from being returned
		 * to the physical memory allocator.
		 */
		i = m - rv->pages;
		KASSERT(popmap_is_clear(rv->popmap, i),
		    ("vm_reserv_break: reserv %p's popmap is corrupted", rv));
		popmap_set(rv->popmap, i);
		rv->popcnt++;
	}
	/*
	 * Walk the population map.  Each pass locates one run of clear bits,
	 * starting at page index "begin_zeroes" and ending just before the
	 * next set bit (or the end of the map), and frees that run.  The set
	 * bits that are skipped are cleared and subtracted from "popcnt" on
	 * the way.  "i" is the current map word and "hi" the bit position
	 * within it from which the search resumes.
	 */
	i = hi = 0;
	do {
		/* Find the next 0 bit.  Any previous 0 bits are < "hi". */
		lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i]));
		if (lo == 0) {
			/* Redundantly clears bits < "hi". */
			rv->popmap[i] = 0;
			rv->popcnt -= NBPOPMAP - hi;
			/* Skip, and account for, words that are entirely set. */
			while (++i < NPOPMAP) {
				lo = ffsl(~rv->popmap[i]);
				if (lo == 0) {
					rv->popmap[i] = 0;
					rv->popcnt -= NBPOPMAP;
				} else
					break;
			}
			if (i == NPOPMAP)
				break;
			hi = 0;
		}
		KASSERT(lo > 0, ("vm_reserv_break: lo is %d", lo));
		/* Convert from ffsl() to ordinary bit numbering. */
		lo--;
		if (lo > 0) {
			/* Redundantly clears bits < "hi". */
			rv->popmap[i] &= ~((1UL << lo) - 1);
			rv->popcnt -= lo - hi;
		}
		begin_zeroes = NBPOPMAP * i + lo;
		/* Find the next 1 bit. */
		do
			hi = ffsl(rv->popmap[i]);
		while (hi == 0 && ++i < NPOPMAP);
		if (i != NPOPMAP)
			/* Convert from ffsl() to ordinary bit numbering. */
			hi--;
		/*
		 * When the map is exhausted, "i" equals NPOPMAP and "hi" is
		 * zero, so the run extends to the end of the reservation.
		 */
		vm_phys_free_contig(&rv->pages[begin_zeroes], NBPOPMAP * i +
		    hi - begin_zeroes);
	} while (i < NPOPMAP);
	KASSERT(rv->popcnt == 0,
	    ("vm_reserv_break: reserv %p's popcnt is corrupted", rv));
	vm_reserv_broken++;
}

/*
 * Breaks all reservations belonging to the given object.
*/ void vm_reserv_break_all(vm_object_t object) { vm_reserv_t rv; mtx_lock(&vm_page_queue_free_mtx); while ((rv = LIST_FIRST(&object->rvq)) != NULL) { KASSERT(rv->object == object, ("vm_reserv_break_all: reserv %p is corrupted", rv)); if (rv->inpartpopq) { TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); rv->inpartpopq = FALSE; } vm_reserv_break(rv, NULL); } mtx_unlock(&vm_page_queue_free_mtx); } /* * Frees the given page if it belongs to a reservation. Returns TRUE if the * page is freed and FALSE otherwise. * * The free page queue lock must be held. */ boolean_t vm_reserv_free_page(vm_page_t m) { vm_reserv_t rv; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); rv = vm_reserv_from_page(m); if (rv->object == NULL) return (FALSE); vm_reserv_depopulate(rv, m - rv->pages); return (TRUE); } /* * Initializes the reservation management system. Specifically, initializes * the reservation array. * * Requires that vm_page_array and first_page are initialized! */ void vm_reserv_init(void) { vm_paddr_t paddr; struct vm_phys_seg *seg; int segind; /* * Initialize the reservation array. Specifically, initialize the * "pages" field for every element that has an underlying superpage. */ for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; paddr = roundup2(seg->start, VM_LEVEL_0_SIZE); while (paddr + VM_LEVEL_0_SIZE <= seg->end) { vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].pages = PHYS_TO_VM_PAGE(paddr); paddr += VM_LEVEL_0_SIZE; } } } /* + * Returns true if the given page belongs to a reservation and that page is + * free. Otherwise, returns false. + */ +bool +vm_reserv_is_page_free(vm_page_t m) +{ + vm_reserv_t rv; + + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + rv = vm_reserv_from_page(m); + if (rv->object == NULL) + return (false); + return (popmap_is_clear(rv->popmap, m - rv->pages)); +} + +/* + * If the given page belongs to a reservation, returns the level of that + * reservation. Otherwise, returns -1. 
+ */ +int +vm_reserv_level(vm_page_t m) +{ + vm_reserv_t rv; + + rv = vm_reserv_from_page(m); + return (rv->object != NULL ? 0 : -1); +} + +/* * Returns a reservation level if the given page belongs to a fully-populated * reservation and -1 otherwise. */ int vm_reserv_level_iffullpop(vm_page_t m) { vm_reserv_t rv; rv = vm_reserv_from_page(m); return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1); } /* * Prepare for the reactivation of a cached page. * * First, suppose that the given page "m" was allocated individually, i.e., not * as part of a reservation, and cached. Then, suppose a reservation * containing "m" is allocated by the same object. Although "m" and the * reservation belong to the same object, "m"'s pindex may not match the * reservation's. * * The free page queue must be locked. */ boolean_t vm_reserv_reactivate_page(vm_page_t m) { vm_reserv_t rv; int index; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); rv = vm_reserv_from_page(m); if (rv->object == NULL) return (FALSE); KASSERT((m->flags & PG_CACHED) != 0, ("vm_reserv_reactivate_page: page %p is not cached", m)); if (m->object == rv->object && m->pindex - rv->pindex == (index = VM_RESERV_INDEX(m->object, m->pindex))) vm_reserv_populate(rv, index); else { KASSERT(rv->inpartpopq, ("vm_reserv_reactivate_page: reserv %p's inpartpopq is FALSE", rv)); TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); rv->inpartpopq = FALSE; /* Don't release "m" to the physical memory allocator. */ vm_reserv_break(rv, m); } return (TRUE); } /* * Breaks the given partially-populated reservation, releasing its cached and * free pages to the physical memory allocator. * * The free page queue lock must be held. 
*/
static void
vm_reserv_reclaim(vm_reserv_t rv)
{

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	KASSERT(rv->inpartpopq,
	    ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv));
	/* vm_reserv_break() asserts that the reservation is off the queue. */
	TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
	rv->inpartpopq = FALSE;
	vm_reserv_break(rv, NULL);
	vm_reserv_reclaimed++;
}

/*
 * Breaks the reservation at the head of the partially-populated reservation
 * queue, releasing its cached and free pages to the physical memory
 * allocator.  Returns TRUE if a reservation is broken and FALSE otherwise.
 *
 * The free page queue lock must be held.
 */
boolean_t
vm_reserv_reclaim_inactive(void)
{
	vm_reserv_t rv;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	if ((rv = TAILQ_FIRST(&vm_rvq_partpop)) != NULL) {
		vm_reserv_reclaim(rv);
		return (TRUE);
	}
	return (FALSE);
}

/*
 * Searches the partially-populated reservation queue for the least recently
 * active reservation with unused pages, i.e., cached or free, that satisfy the
 * given request for contiguous physical memory.  If a satisfactory reservation
 * is found, it is broken.  Returns TRUE if a reservation is broken and FALSE
 * otherwise.
 *
 * The free page queue lock must be held.
 */
boolean_t
vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
    u_long alignment, vm_paddr_t boundary)
{
	vm_paddr_t pa, size;
	vm_reserv_t rv;
	int hi, i, lo, low_index, next_free;

	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
	/* A partially-populated reservation has at least one page in use. */
	if (npages > VM_LEVEL_0_NPAGES - 1)
		return (FALSE);
	size = npages << PAGE_SHIFT;
	TAILQ_FOREACH(rv, &vm_rvq_partpop, partpopq) {
		pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]);
		if (pa + PAGE_SIZE - size < low) {
			/* This entire reservation is too low; go to next. */
			continue;
		}
		pa = VM_PAGE_TO_PHYS(&rv->pages[0]);
		if (pa + size > high) {
			/* This entire reservation is too high; go to next. */
			continue;
		}
		if (pa < low) {
			/* Start the search for free pages at "low". */
			low_index = (low + PAGE_MASK - pa) >> PAGE_SHIFT;
			i = low_index / NBPOPMAP;
			hi = low_index % NBPOPMAP;
		} else
			i = hi = 0;
		/*
		 * "i" is the current population map word and "hi" the bit
		 * position within it from which the search resumes.
		 */
		do {
			/* Find the next free page. */
			lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i]));
			while (lo == 0 && ++i < NPOPMAP)
				lo = ffsl(~rv->popmap[i]);
			if (i == NPOPMAP)
				break;
			/* Convert from ffsl() to ordinary bit numbering. */
			lo--;
			next_free = NBPOPMAP * i + lo;
			pa = VM_PAGE_TO_PHYS(&rv->pages[next_free]);
			KASSERT(pa >= low,
			    ("vm_reserv_reclaim_contig: pa is too low"));
			if (pa + size > high) {
				/* The rest of this reservation is too high. */
				break;
			} else if ((pa & (alignment - 1)) != 0 ||
			    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) {
				/*
				 * The current page doesn't meet the alignment
				 * and/or boundary requirements.  Continue
				 * searching this reservation until the rest
				 * of its free pages are either excluded or
				 * exhausted.
				 */
				hi = lo + 1;
				if (hi >= NBPOPMAP) {
					hi = 0;
					i++;
				}
				continue;
			}
			/* Find the next used page. */
			hi = ffsl(rv->popmap[i] & ~((1UL << lo) - 1));
			while (hi == 0 && ++i < NPOPMAP) {
				/*
				 * The free run already reaches the start of
				 * word "i"; stop as soon as it is long enough.
				 */
				if ((NBPOPMAP * i - next_free) * PAGE_SIZE >=
				    size) {
					vm_reserv_reclaim(rv);
					return (TRUE);
				}
				hi = ffsl(rv->popmap[i]);
			}
			/* Convert from ffsl() to ordinary bit numbering. */
			if (i != NPOPMAP)
				hi--;
			/* The free run covers pages [next_free, NBPOPMAP * i + hi). */
			if ((NBPOPMAP * i + hi - next_free) * PAGE_SIZE >=
			    size) {
				vm_reserv_reclaim(rv);
				return (TRUE);
			}
		} while (i < NPOPMAP);
	}
	return (FALSE);
}

/*
 * Transfers the reservation underlying the given page to a new object.
 *
 * The object must be locked.
*/ void vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object, vm_pindex_t old_object_offset) { vm_reserv_t rv; VM_OBJECT_ASSERT_WLOCKED(new_object); rv = vm_reserv_from_page(m); if (rv->object == old_object) { mtx_lock(&vm_page_queue_free_mtx); if (rv->object == old_object) { LIST_REMOVE(rv, objq); LIST_INSERT_HEAD(&new_object->rvq, rv, objq); rv->object = new_object; rv->pindex -= old_object_offset; } mtx_unlock(&vm_page_queue_free_mtx); + } +} + +/* + * Returns the size (in bytes) of a reservation of the specified level. + */ +int +vm_reserv_size(int level) +{ + + switch (level) { + case 0: + return (VM_LEVEL_0_SIZE); + case -1: + return (PAGE_SIZE); + default: + return (0); } } /* * Allocates the virtual and physical memory required by the reservation * management system's data structures, in particular, the reservation array. */ vm_paddr_t vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t high_water) { vm_paddr_t new_end; size_t size; /* * Calculate the size (in bytes) of the reservation array. Round up * from "high_water" because every small page is mapped to an element * in the reservation array based on its physical address. Thus, the * number of elements in the reservation array can be greater than the * number of superpages. */ size = howmany(high_water, VM_LEVEL_0_SIZE) * sizeof(struct vm_reserv); /* * Allocate and map the physical memory for the reservation array. The * next available virtual address is returned by reference. */ new_end = end - round_page(size); vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero(vm_reserv_array, size); /* * Return the next available physical address. 
*/ return (new_end); } #endif /* VM_NRESERVLEVEL > 0 */ Index: head/sys/vm/vm_reserv.h =================================================================== --- head/sys/vm/vm_reserv.h (revision 292468) +++ head/sys/vm/vm_reserv.h (revision 292469) @@ -1,68 +1,71 @@ /*- * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007-2008 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ /* * Superpage reservation management definitions */ #ifndef _VM_RESERV_H_ #define _VM_RESERV_H_ #ifdef _KERNEL #if VM_NRESERVLEVEL > 0 /* * The following functions are only to be used by the virtual memory system. */ vm_page_t vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); vm_page_t vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); void vm_reserv_break_all(vm_object_t object); boolean_t vm_reserv_free_page(vm_page_t m); void vm_reserv_init(void); +bool vm_reserv_is_page_free(vm_page_t m); +int vm_reserv_level(vm_page_t m); int vm_reserv_level_iffullpop(vm_page_t m); boolean_t vm_reserv_reactivate_page(vm_page_t m); boolean_t vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); boolean_t vm_reserv_reclaim_inactive(void); void vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object, vm_pindex_t old_object_offset); +int vm_reserv_size(int level); vm_paddr_t vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t high_water); #endif /* VM_NRESERVLEVEL > 0 */ #endif /* _KERNEL */ #endif /* !_VM_RESERV_H_ */