Index: sys/kern/subr_pctrie.c =================================================================== --- sys/kern/subr_pctrie.c +++ sys/kern/subr_pctrie.c @@ -63,7 +63,6 @@ #endif #define PCTRIE_MASK (PCTRIE_COUNT - 1) -#define PCTRIE_LIMIT (howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1) #if PCTRIE_WIDTH == 3 typedef uint8_t pn_popmap_t; @@ -738,42 +737,25 @@ } #endif -/* - * Remove the specified index from the tree, and return the value stored at - * that index. If the index is not present, return NULL. - */ -uint64_t * -pctrie_remove_lookup(struct pctrie *ptree, uint64_t index, - struct pctrie_node **freenode) +static void +pctrie_remove(struct pctrie *ptree, uint64_t index, struct pctrie_node *parent, + struct pctrie_node *node, struct pctrie_node **freenode) { - struct pctrie_node *child, *node, *parent; - uint64_t *m; + struct pctrie_node *child; int slot; - *freenode = node = NULL; - child = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED); - for (;;) { - if (pctrie_isleaf(child)) - break; - parent = node; - node = child; - slot = pctrie_slot(node, index); - child = pctrie_node_load(&node->pn_child[slot], NULL, - PCTRIE_LOCKED); - } - if ((m = pctrie_toval(child)) == NULL || *m != index) - return (NULL); if (node == NULL) { pctrie_root_store(ptree, PCTRIE_NULL, PCTRIE_LOCKED); - return (m); + return; } + slot = pctrie_slot(node, index); KASSERT((node->pn_popmap & (1 << slot)) != 0, ("%s: bad popmap slot %d in node %p", __func__, slot, node)); node->pn_popmap ^= 1 << slot; pctrie_node_store(&node->pn_child[slot], PCTRIE_NULL, PCTRIE_LOCKED); if (!powerof2(node->pn_popmap)) - return (m); + return; KASSERT(node->pn_popmap != 0, ("%s: bad popmap all zeroes", __func__)); slot = ffs(node->pn_popmap) - 1; child = pctrie_node_load(&node->pn_child[slot], NULL, PCTRIE_LOCKED); @@ -795,6 +777,33 @@ */ pctrie_node_put(node); *freenode = node; +} + +/* + * Remove the specified index from the tree, and return the value stored at + * that index. If the index is not present, return NULL. + */ +uint64_t * +pctrie_remove_lookup(struct pctrie *ptree, uint64_t index, + struct pctrie_node **freenode) +{ + struct pctrie_node *child, *node, *parent; + uint64_t *m; + int slot; + + parent = (struct pctrie_node *)0xdeadbeef; + *freenode = node = NULL; + child = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED); + while (!pctrie_isleaf(child)) { + parent = node; + node = child; + slot = pctrie_slot(node, index); + child = pctrie_node_load(&node->pn_child[slot], NULL, + PCTRIE_LOCKED); + } + if ((m = pctrie_toval(child)) == NULL || *m != index) + return (NULL); + pctrie_remove(ptree, index, parent, node, freenode); return (m); } @@ -901,6 +910,163 @@ callback, keyoff, arg)); } +/* + * Find first leaf >= index, and fill iter with the path to the parent of that + * leaf. + */ +static uint64_t * +pctrie_iter_index(struct pctrie_iter *iter, struct pctrie_node *node, + uint64_t index) +{ + uint64_t *m; + int slot; + + /* Seek a node that matches index. */ + while (!pctrie_isleaf(node) && + !pctrie_keybarr(node, index, &slot)) { + iter->path[iter->top++] = node; + KASSERT(iter->top <= nitems(iter->path), + ("%s: path overflow in trie %p", __func__, iter->ptree)); + node = pctrie_node_load(&node->pn_child[slot], NULL, + PCTRIE_LOCKED); + } + + /* + * If no such node was found, and instead this path leads only to nodes + * < index, back up to find a subtrie with the least value > index. + */ + if (pctrie_isleaf(node) ? + (m = pctrie_toval(node)) == NULL || *m < index : + node->pn_owner < index) { + /* Climb the path to find a node with a descendant > index. */ + do { + if (iter->top == 0) { + MPASS(pctrie_lookup_ge(iter->ptree, index) == + NULL); + return (NULL); + } + node = iter->path[--iter->top]; + slot = pctrie_slot(node, index) + 1; + } while ((node->pn_popmap >> slot) == 0); + + /* Step to the least child with a descendant > index. */ + slot += ffs(node->pn_popmap >> slot) - 1; + iter->top++; + node = pctrie_node_load(&node->pn_child[slot], NULL, + PCTRIE_LOCKED); + } + /* Descend to the least leaf of the subtrie. */ + while (!pctrie_isleaf(node)) { + slot = ffs(node->pn_popmap) - 1; + iter->path[iter->top++] = node; + KASSERT(iter->top <= nitems(iter->path), + ("%s: path overflow in trie %p", __func__, iter->ptree)); + node = pctrie_node_load(&node->pn_child[slot], NULL, + PCTRIE_LOCKED); + } + m = pctrie_toval(node); + iter->index = *m; + MPASS(pctrie_lookup_ge(iter->ptree, index) == m); + return (m); +} + +/* + * Find the first leaf with value at least 'step' greater than the previous + * leaf. + */ +uint64_t * +pctrie_iter_step(struct pctrie_iter *iter, uint64_t step) +{ + struct pctrie_node *node; + uint64_t index = iter->index + step; + int slot; + + /* Detect step overflow. */ + if (index < iter->index) + return (NULL); + + if (iter->top == 0) { + uint64_t *m; + node = pctrie_root_load(iter->ptree, NULL, PCTRIE_UNSERIALIZED); + if (!pctrie_isleaf(node)) + /* + * If there have been insertions since the last + * iteration, the root may not be a leaf; in that case, + * put the root on the path. + */ + iter->path[iter->top++] = node; + else if ((m = pctrie_toval(node)) != NULL && *m >= index) { + /* If the root is the only value in the trie and it is + * larger than the current index value, it was promoted + * to root via the removal of another, and should be + * considered as the last iteration. + */ + iter->index = *m; + MPASS(pctrie_lookup_ge(iter->ptree, index) == m); + return (m); + } + } + + /* Climb the path to find a node that matches a prefix of index. */ + do { + if (iter->top == 0) + return (NULL); + node = iter->path[--iter->top]; + } while (pctrie_keybarr(node, index, &slot)); + + /* Complete the search for index from node. */ + iter->top++; + node = pctrie_node_load(&node->pn_child[slot], NULL, PCTRIE_LOCKED); + return (pctrie_iter_index(iter, node, index)); +} + +/* + * Find first leaf >= index, and initialize iter with the path to the parent of + * that leaf. + */ +uint64_t * +pctrie_iter_start(struct pctrie_iter *iter, + struct pctrie *ptree, uint64_t index) +{ + struct pctrie_node *node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED); + + iter->ptree = ptree; + iter->top = 0; + return (pctrie_iter_index(iter, node, index)); +} + +/* + * Remove from the trie the leaf last chosen by the iterator, and + * adjust the path if it's last member is to be freed. + */ +uint64_t * +pctrie_iter_remove(struct pctrie_iter *iter, struct pctrie_node **freenode) +{ + struct pctrie_node *child, *node, *parent; + uint64_t *m; + int slot; + + *freenode = NULL; + parent = (struct pctrie_node *)0xdeadbeef; + if (iter->top >= 1) { + if (iter->top >= 2) + parent = iter->path[iter->top - 2]; + node = iter->path[iter->top - 1]; + slot = pctrie_slot(node, iter->index); + child = pctrie_node_load(&node->pn_child[slot], NULL, + PCTRIE_LOCKED); + } else { + node = NULL; + child = pctrie_root_load(iter->ptree, NULL, PCTRIE_LOCKED); + } + if ((m = pctrie_toval(child)) == NULL || *m != iter->index) + return (NULL); + pctrie_remove(iter->ptree, iter->index, parent, node, freenode); + if (*freenode != NULL) + --iter->top; + return (m); +} + /* * Replace an existing value in the trie with another one. * Panics if there is not an old value in the trie at the new value's index. Index: sys/kern/subr_rangeset.c =================================================================== --- sys/kern/subr_rangeset.c +++ sys/kern/subr_rangeset.c @@ -272,8 +272,8 @@ int rangeset_copy(struct rangeset *dst_rs, struct rangeset *src_rs) { + struct pctrie_iter iter; struct rs_el *src_r, *dst_r; - uint64_t cursor; int error; MPASS(pctrie_is_empty(&dst_rs->rs_trie)); @@ -281,10 +281,8 @@ MPASS(dst_rs->rs_dup_data == src_rs->rs_dup_data); error = 0; - for (cursor = 0;; cursor = src_r->re_start + 1) { - src_r = RANGESET_PCTRIE_LOOKUP_GE(&src_rs->rs_trie, cursor); - if (src_r == NULL) - break; + for (src_r = RANGESET_PCTRIE_ITER_START(&iter, &src_rs->rs_trie, 0); + src_r != NULL; src_r = RANGESET_PCTRIE_ITER_STEP(&iter, 1)) { dst_r = dst_rs->rs_dup_data(dst_rs->rs_data_ctx, src_r); if (dst_r == NULL) { error = ENOMEM; @@ -303,13 +301,11 @@ static void rangeset_check(struct rangeset *rs) { + struct pctrie_iter iter; struct rs_el *r, *rp; - uint64_t cursor; - for (cursor = 0, rp = NULL;; cursor = r->re_start + 1, rp = r) { - r = RANGESET_PCTRIE_LOOKUP_GE(&rs->rs_trie, cursor); - if (r == NULL) - break; + for (rp = NULL, r = RANGESET_PCTRIE_ITER_START(&iter, &rs->rs_trie, 0); + r != NULL; rp = r, r = RANGESET_PCTRIE_ITER_STEP(&iter, 1)) { KASSERT(r->re_start < r->re_end, ("invalid interval rs %p elem %p (%#jx, %#jx)", rs, r, (uintmax_t)r->re_start, (uintmax_t)r->re_end)); @@ -332,9 +328,9 @@ DB_SHOW_COMMAND(rangeset, rangeset_show_fn) { + struct pctrie_iter iter; struct rangeset *rs; struct rs_el *r; - uint64_t cursor; if (!have_addr) { db_printf("show rangeset addr\n"); @@ -343,10 +339,8 @@ rs = (struct rangeset *)addr; db_printf("rangeset %p\n", rs); - for (cursor = 0;; cursor = r->re_start + 1) { - r = RANGESET_PCTRIE_LOOKUP_GE(&rs->rs_trie, cursor); - if (r == NULL) - break; + for (r = RANGESET_PCTRIE_ITER_START(&iter, &rs->rs_trie, 0); + r != NULL; r = RANGESET_PCTRIE_ITER_STEP(&iter, 1)) { db_printf(" el %p start %#jx end %#jx\n", r, r->re_start, r->re_end); } Index: sys/sys/pctrie.h =================================================================== --- sys/sys/pctrie.h +++ sys/sys/pctrie.h @@ -240,6 +240,34 @@ } \ \ static __inline __unused struct type * \ +name##_PCTRIE_ITER_STEP(struct pctrie_iter *iter, uint64_t step) \ +{ \ + return name##_PCTRIE_VAL2PTR( \ + pctrie_iter_step(iter, step)); \ +} \ + \ +static __inline __unused struct type * \ +name##_PCTRIE_ITER_START(struct pctrie_iter *iter, \ + struct pctrie *ptree, uint64_t index) \ +{ \ + return name##_PCTRIE_VAL2PTR( \ + pctrie_iter_start(iter, ptree, index)); \ +} \ + \ +static __inline __unused void \ +name##_PCTRIE_ITER_REMOVE(struct pctrie_iter *iter) \ +{ \ + uint64_t *val; \ + struct pctrie_node *freenode; \ + \ + val = pctrie_iter_remove(iter, &freenode); \ + if (val == NULL) \ + panic("%s: key not found", __func__); \ + if (freenode != NULL) \ + freefn(iter->ptree, freenode); \ +} \ + \ +static __inline __unused struct type * \ name##_PCTRIE_REPLACE(struct pctrie *ptree, struct type *ptr) \ { \ \ @@ -295,6 +323,12 @@ pctrie_cb_t callback, int keyoff, void *arg); struct pctrie_node *pctrie_reclaim_resume_cb(struct pctrie_node **pnode, pctrie_cb_t callback, int keyoff, void *arg); +struct pctrie_iter; +uint64_t *pctrie_iter_step(struct pctrie_iter *iter, uint64_t step); +uint64_t *pctrie_iter_start(struct pctrie_iter *iter, + struct pctrie *ptree, uint64_t index); +uint64_t *pctrie_iter_remove(struct pctrie_iter *iter, + struct pctrie_node **freenode); uint64_t *pctrie_remove_lookup(struct pctrie *ptree, uint64_t index, struct pctrie_node **killnode); uint64_t *pctrie_replace(struct pctrie *ptree, uint64_t *newval); @@ -342,6 +376,14 @@ #endif #define PCTRIE_COUNT (1 << PCTRIE_WIDTH) +#define PCTRIE_LIMIT (howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1) + +struct pctrie_iter { + struct pctrie_node *path[PCTRIE_LIMIT]; + struct pctrie *ptree; + uint64_t index; + int top; +}; #endif /* _KERNEL */ #endif /* !_SYS_PCTRIE_H_ */ Index: sys/vm/swap_pager.c =================================================================== --- sys/vm/swap_pager.c +++ sys/vm/swap_pager.c @@ -491,7 +491,7 @@ static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t, vm_size_t *); static void swp_pager_meta_transfer(vm_object_t src, vm_object_t dst, - vm_pindex_t pindex, vm_pindex_t count, vm_size_t *freed); + vm_pindex_t pindex, vm_pindex_t count); static void swp_pager_meta_free_all(vm_object_t); static daddr_t swp_pager_meta_lookup(vm_object_t, vm_pindex_t); @@ -1084,8 +1084,7 @@ /* * Transfer source to destination. */ - swp_pager_meta_transfer(srcobject, dstobject, offset, dstobject->size, - NULL); + swp_pager_meta_transfer(srcobject, dstobject, offset, dstobject->size); /* * Free left over swap blocks in source. @@ -1776,8 +1775,8 @@ u_long swap_pager_swapped_pages(vm_object_t object) { + struct pctrie_iter iter; struct swblk *sb; - vm_pindex_t pi; u_long res; int i; @@ -1786,9 +1785,10 @@ if (pctrie_is_empty(&object->un_pager.swp.swp_blks)) return (0); - for (res = 0, pi = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( - &object->un_pager.swp.swp_blks, pi)) != NULL; - pi = sb->p + SWAP_META_PAGES) { + res = 0; + for (sb = SWAP_PCTRIE_ITER_START(&iter, + &object->un_pager.swp.swp_blks, 0); sb != NULL; + sb = SWAP_PCTRIE_ITER_STEP(&iter, SWAP_META_PAGES)) { for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->d[i] != SWAPBLK_NONE) res++; @@ -1806,6 +1806,7 @@ static void swap_pager_swapoff_object(struct swdevt *sp, vm_object_t object) { + struct pctrie_iter iter; struct page_range range; struct swblk *sb; vm_page_t m; @@ -1822,29 +1823,28 @@ i = 0; swp_pager_init_freerange(&range); for (;;) { - if (i == 0 && (object->flags & OBJ_DEAD) != 0) { - /* - * Make sure that pending writes finish before - * returning. - */ - vm_object_pip_wait(object, "swpoff"); - swp_pager_meta_free_all(object); - break; - } - - if (i == SWAP_META_PAGES) { + if (i == 0) { + if ((object->flags & OBJ_DEAD) != 0) { + /* + * Make sure that pending writes finish before + * returning. + */ + vm_object_pip_wait(object, "swpoff"); + swp_pager_meta_free_all(object); + break; + } + sb = SWAP_PCTRIE_ITER_START(&iter, + &object->un_pager.swp.swp_blks, pi); + } else if (i == SWAP_META_PAGES) { pi = sb->p + SWAP_META_PAGES; if (sb_empty) { - SWAP_PCTRIE_REMOVE( - &object->un_pager.swp.swp_blks, sb->p); + SWAP_PCTRIE_ITER_REMOVE(&iter); uma_zfree(swblk_zone, sb); } + sb = SWAP_PCTRIE_ITER_STEP(&iter, SWAP_META_PAGES); i = 0; } - if (i == 0) { - sb = SWAP_PCTRIE_LOOKUP_GE( - &object->un_pager.swp.swp_blks, pi); if (sb == NULL) break; sb_empty = true; @@ -2142,31 +2142,27 @@ } /* - * SWP_PAGER_META_TRANSFER() - free a range of blocks in the srcobject's swap - * metadata, or transfer it into dstobject. + * SWP_PAGER_META_TRANSFER() - transfer a range of blocks in the srcobject's + * swap metadata into dstobject. * * This routine will free swap metadata structures as they are cleaned * out. */ static void swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject, - vm_pindex_t pindex, vm_pindex_t count, vm_size_t *moved) + vm_pindex_t pindex, vm_pindex_t count) { struct page_range range; struct swblk *sb; daddr_t blk; - vm_page_t m; vm_pindex_t offset, last; - vm_size_t mc; int i, limit, start; VM_OBJECT_ASSERT_WLOCKED(srcobject); - MPASS(moved == NULL || dstobject == NULL); + VM_OBJECT_ASSERT_WLOCKED(dstobject); - mc = 0; - m = NULL; if (count == 0 || pctrie_is_empty(&srcobject->un_pager.swp.swp_blks)) - goto out; + return; swp_pager_init_freerange(&range); offset = pindex; @@ -2180,13 +2176,11 @@ limit = last - sb->p < SWAP_META_PAGES ? last - sb->p : SWAP_META_PAGES; for (i = start; i < limit; i++) { - blk = sb->d[i]; - if (blk == SWAPBLK_NONE) + if (sb->d[i] == SWAPBLK_NONE) continue; - if (dstobject == NULL || - (blk = swp_pager_meta_build(dstobject, - sb->p + i - offset, blk, true), - blk != sb->d[i] && blk != SWAPBLK_NONE)) + blk = swp_pager_meta_build(dstobject, + sb->p + i - offset, sb->d[i], true); + if (blk != sb->d[i] && blk != SWAPBLK_NONE) swp_pager_update_freerange(&range, sb->d[i]); else if (blk == sb->d[i]) { /* @@ -2197,17 +2191,9 @@ */ VM_OBJECT_WUNLOCK(srcobject); swp_pager_meta_build(dstobject, - sb->p + i - offset, blk, false); + sb->p + i - offset, sb->d[i], false); VM_OBJECT_WLOCK(srcobject); } - if (moved != NULL) { - if (m != NULL && m->pindex != pindex + i - 1) - m = NULL; - m = m != NULL ? vm_page_next(m) : - vm_page_lookup(srcobject, pindex + i); - if (m == NULL || vm_page_none_valid(m)) - mc++; - } sb->d[i] = SWAPBLK_NONE; } pindex = sb->p + SWAP_META_PAGES; @@ -2219,9 +2205,6 @@ } } swp_pager_freeswapspace(&range); -out: - if (moved != NULL) - *moved = mc; } /* @@ -2238,7 +2221,54 @@ swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count, vm_size_t *freed) { - swp_pager_meta_transfer(object, NULL, pindex, count, freed); + struct pctrie_iter iter; + struct page_range range; + struct swblk *sb; + vm_page_t m; + vm_pindex_t last; + vm_size_t fc; + int i, limit, start; + + VM_OBJECT_ASSERT_WLOCKED(object); + + fc = 0; + m = NULL; + if (count == 0 || pctrie_is_empty(&object->un_pager.swp.swp_blks)) + goto out; + + swp_pager_init_freerange(&range); + last = pindex + count; + for (sb = SWAP_PCTRIE_ITER_START(&iter, &object->un_pager.swp.swp_blks, + rounddown(pindex, SWAP_META_PAGES)); sb != NULL && sb->p < last; + sb = SWAP_PCTRIE_ITER_STEP(&iter, SWAP_META_PAGES)) { + start = pindex > sb->p ? pindex - sb->p : 0; + limit = last - sb->p < SWAP_META_PAGES ? last - sb->p : + SWAP_META_PAGES; + for (i = start; i < limit; i++) { + if (sb->d[i] == SWAPBLK_NONE) + continue; + swp_pager_update_freerange(&range, sb->d[i]); + if (freed != NULL) { + if (m != NULL && m->pindex != pindex + i - 1) + m = NULL; + m = m != NULL ? vm_page_next(m) : + vm_page_lookup(object, pindex + i); + if (m == NULL || vm_page_none_valid(m)) + fc++; + } + sb->d[i] = SWAPBLK_NONE; + } + pindex = sb->p + SWAP_META_PAGES; + if (swp_pager_swblk_empty(sb, 0, start) && + swp_pager_swblk_empty(sb, limit, SWAP_META_PAGES)) { + SWAP_PCTRIE_ITER_REMOVE(&iter); + uma_zfree(swblk_zone, sb); + } + } + swp_pager_freeswapspace(&range); +out: + if (freed != NULL) + *freed = fc; } static void @@ -2314,6 +2344,7 @@ vm_pindex_t swap_pager_find_least(vm_object_t object, vm_pindex_t pindex) { + struct pctrie_iter iter; struct swblk *sb; int i; @@ -2322,7 +2353,7 @@ if (pctrie_is_empty(&object->un_pager.swp.swp_blks)) return (object->size); - sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, + sb = SWAP_PCTRIE_ITER_START(&iter, &object->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); if (sb == NULL) return (object->size); @@ -2331,8 +2362,7 @@ if (sb->d[i] != SWAPBLK_NONE) return (sb->p + i); } - sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, - roundup(pindex, SWAP_META_PAGES)); + sb = SWAP_PCTRIE_ITER_STEP(&iter, SWAP_META_PAGES); if (sb == NULL) return (object->size); } @@ -2776,6 +2806,8 @@ long vmspace_swap_count(struct vmspace *vmspace) { + struct pctrie_iter iter; + struct pctrie *ptree; vm_map_t map; vm_map_entry_t cur; vm_object_t object; @@ -2796,13 +2828,12 @@ VM_OBJECT_RLOCK(object); if ((object->flags & OBJ_SWAP) == 0) goto unlock; + ptree = &object->un_pager.swp.swp_blks; pi = OFF_TO_IDX(cur->offset); e = pi + OFF_TO_IDX(cur->end - cur->start); - for (;; pi = sb->p + SWAP_META_PAGES) { - sb = SWAP_PCTRIE_LOOKUP_GE( - &object->un_pager.swp.swp_blks, pi); - if (sb == NULL || sb->p >= e) - break; + for (sb = SWAP_PCTRIE_ITER_START(&iter, ptree, pi); + sb != NULL && sb->p < e; + sb = SWAP_PCTRIE_ITER_STEP(&iter, SWAP_META_PAGES)) { for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->p + i < e && sb->d[i] != SWAPBLK_NONE)