diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c --- a/sys/kern/kern_rangelock.c +++ b/sys/kern/kern_rangelock.c @@ -2,7 +2,11 @@ * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2009 Konstantin Belousov - * All rights reserved. + * Copyright (c) 2023 The FreeBSD Foundation + * + * Portions of this software were developed by + * Konstantin Belousov under sponsorship from + * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -27,304 +31,699 @@ */ #include +#include #include #include #include #include #include -#include +#include +#include +#include #include +/* + * Immediately after initialization (subject to 'rangelock_cheat' + * below), and until a request arrives that conflicts with the granted + * ones based on type, rangelocks serve requests in "cheating" mode. + * In this mode, a rangelock behaves like an sx lock, as if each request + * covered the whole range of the protected object. On receiving a + * conflicting request (any request while a write request is + * effective, or any write request while some read ones are + * effective), all requests granted in "cheating" mode are drained, + * and the rangelock then switches to keeping track of the precise + * range of each new request. + * + * The normal sx implementation is not used, to avoid bloating the + * structures (most importantly, vnodes) that embed rangelocks. + * + * Cheating greatly helps the very common pattern where a file is + * first written single-threaded and then read by many processes. + * + * The lock is in cheat mode when the RL_CHEAT_CHEATING bit is set in + * lock->head. Special cookies are returned in this mode, and + * trylocks behave the same as normal locks but do not drain. 
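+ * + * For example (derived from the RL_CHEAT_* definitions below): with + * two read requests granted in cheat mode, head == 2 * RL_CHEAT_READER + * | RL_CHEAT_CHEATING == 0x11; with a write request granted, head == + * RL_CHEAT_WLOCKED | RL_CHEAT_CHEATING == 0x3; RL_CHEAT_DRAINING is + * additionally set while the granted cheat-mode requests are drained. 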
+ */ + +static int rangelock_cheat = 1; +SYSCTL_INT(_debug, OID_AUTO, rangelock_cheat, CTLFLAG_RWTUN, + &rangelock_cheat, 0, + ""); + +#define RL_CHEAT_MASK 0x7 +#define RL_CHEAT_CHEATING 0x1 +/* #define RL_CHEAT_RLOCKED 0x0 */ +#define RL_CHEAT_WLOCKED 0x2 +#define RL_CHEAT_DRAINING 0x4 + +#define RL_CHEAT_READER 0x8 + +#define RL_RET_CHEAT_RLOCKED 0x1100 +#define RL_RET_CHEAT_WLOCKED 0x2200 + +static bool +rangelock_cheat_lock(struct rangelock *lock, int locktype, bool trylock, + void **cookie) +{ + uintptr_t v, x; + + v = atomic_load_ptr(&lock->head); + if ((v & RL_CHEAT_CHEATING) == 0) + return (false); + if ((v & RL_CHEAT_DRAINING) != 0) { +drain: + if (trylock) { + *cookie = NULL; + return (true); + } + sleepq_lock(&lock->head); +drain1: + DROP_GIANT(); + for (;;) { + v = atomic_load_ptr(&lock->head); + if ((v & RL_CHEAT_DRAINING) == 0) + break; + sleepq_add(&lock->head, NULL, "ranged1", 0, 0); + sleepq_wait(&lock->head, PRI_USER); + sleepq_lock(&lock->head); + } + sleepq_release(&lock->head); + PICKUP_GIANT(); + return (false); + } + + switch (locktype) { + case RL_LOCK_READ: + for (;;) { + if ((v & RL_CHEAT_WLOCKED) != 0) { + if (trylock) { + *cookie = NULL; + return (true); + } + x = v | RL_CHEAT_DRAINING; + sleepq_lock(&lock->head); + if (atomic_fcmpset_rel_ptr(&lock->head, &v, + x) != 0) + goto drain1; + sleepq_release(&lock->head); + /* Possibly forgive passed conflict */ + continue; + } + x = (v & ~RL_CHEAT_MASK) + RL_CHEAT_READER; + x |= RL_CHEAT_CHEATING; + if (atomic_fcmpset_acq_ptr(&lock->head, &v, x) != 0) + break; + if ((v & RL_CHEAT_CHEATING) == 0) + return (false); + if ((v & RL_CHEAT_DRAINING) != 0) + goto drain; + } + *(uintptr_t *)cookie = RL_RET_CHEAT_RLOCKED; + break; + case RL_LOCK_WRITE: + for (;;) { + if ((v & ~RL_CHEAT_MASK) >= RL_CHEAT_READER || + (v & RL_CHEAT_WLOCKED) != 0) { + if (trylock) { + *cookie = NULL; + return (true); + } + x = v | RL_CHEAT_DRAINING; + sleepq_lock(&lock->head); + if (atomic_fcmpset_rel_ptr(&lock->head, &v, + x) != 0) + goto drain1; + sleepq_release(&lock->head); + /* Possibly forgive passed conflict */ + continue; + } + x = RL_CHEAT_WLOCKED | RL_CHEAT_CHEATING; + if (atomic_fcmpset_acq_ptr(&lock->head, &v, x) != 0) + break; + if ((v & RL_CHEAT_CHEATING) == 0) + return (false); + if ((v & RL_CHEAT_DRAINING) != 0) + goto drain; + } + *(uintptr_t *)cookie = RL_RET_CHEAT_WLOCKED; + break; + default: + __assert_unreachable(); + break; + } + return (true); +} + +static bool +rangelock_cheat_unlock(struct rangelock *lock, void *cookie) +{ + uintptr_t v, x; + + v = atomic_load_ptr(&lock->head); + if ((v & RL_CHEAT_CHEATING) == 0) + return (false); + + MPASS((uintptr_t)cookie == RL_RET_CHEAT_WLOCKED || + (uintptr_t)cookie == RL_RET_CHEAT_RLOCKED); + + switch ((uintptr_t)cookie) { + case RL_RET_CHEAT_RLOCKED: + for (;;) { + MPASS((v & ~RL_CHEAT_MASK) >= RL_CHEAT_READER); + MPASS((v & RL_CHEAT_WLOCKED) == 0); + x = (v & ~RL_CHEAT_MASK) - RL_CHEAT_READER; + if ((v & RL_CHEAT_DRAINING) != 0) { + if (x != 0) { + x |= RL_CHEAT_DRAINING | + RL_CHEAT_CHEATING; + if (atomic_fcmpset_rel_ptr(&lock->head, + &v, x) != 0) + break; + } else { + sleepq_lock(&lock->head); + if (atomic_fcmpset_rel_ptr(&lock->head, + &v, x) != 0) { + sleepq_broadcast( + &lock->head, + SLEEPQ_SLEEP, 0, 0); + sleepq_release(&lock->head); + break; + } + sleepq_release(&lock->head); + } + } else { + x |= RL_CHEAT_CHEATING; + if (atomic_fcmpset_rel_ptr(&lock->head, &v, + x) != 0) + break; + } + } + break; + case RL_RET_CHEAT_WLOCKED: + for (;;) { + MPASS((v & RL_CHEAT_WLOCKED) != 
0); + if ((v & RL_CHEAT_DRAINING) != 0) { + sleepq_lock(&lock->head); + atomic_store_ptr(&lock->head, 0); + sleepq_broadcast(&lock->head, + SLEEPQ_SLEEP, 0, 0); + sleepq_release(&lock->head); + break; + } else { + if (atomic_fcmpset_ptr(&lock->head, &v, + RL_CHEAT_CHEATING) != 0) + break; + } + } + break; + default: + __assert_unreachable(); + break; + } + return (true); +} + +static bool +rangelock_cheat_destroy(struct rangelock *lock) +{ + uintptr_t v; + + v = atomic_load_ptr(&lock->head); + if ((v & RL_CHEAT_CHEATING) == 0) + return (false); + MPASS(v == RL_CHEAT_CHEATING); + return (true); +} + +/* + * Implementation of range locks based on the paper + * https://doi.org/10.1145/3342195.3387533 + * arXiv:2006.12144v1 [cs.OS] 22 Jun 2020 + * Scalable Range Locks for Scalable Address Spaces and Beyond + * by Alex Kogan, Dave Dice, and Shady Issa + */ + +static struct rl_q_entry *rl_e_unmark(const struct rl_q_entry *e); + +/* + * rl_q_next links all granted ranges in the lock. We cannot free an + * rl_q_entry while in the smr section, and cannot reuse rl_q_next + * linkage since other threads might follow it even after CAS removed + * the range. Use rl_q_free for local list of ranges to remove after + * the smr section is dropped. + */ struct rl_q_entry { - TAILQ_ENTRY(rl_q_entry) rl_q_link; + struct rl_q_entry *rl_q_next; + struct rl_q_entry *rl_q_free; off_t rl_q_start, rl_q_end; int rl_q_flags; +#ifdef INVARIANTS + struct thread *rl_q_owner; +#endif }; static uma_zone_t rl_entry_zone; +static smr_t rl_smr; static void rangelock_sys_init(void) { - rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct rl_q_entry), + UMA_ZONE_SMR); + rl_smr = uma_zone_get_smr(rl_entry_zone); } -SYSINIT(vfs, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL); +SYSINIT(rl, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL); static struct rl_q_entry * -rlqentry_alloc(void) +rlqentry_alloc(vm_ooffset_t start, vm_ooffset_t end, int flags) { + struct rl_q_entry *e; + struct thread *td; - return (uma_zalloc(rl_entry_zone, M_WAITOK)); + td = curthread; + if (td->td_rlqe != NULL) { + e = td->td_rlqe; + td->td_rlqe = NULL; + } else { + e = uma_zalloc_smr(rl_entry_zone, M_WAITOK); + } + e->rl_q_next = NULL; + e->rl_q_free = NULL; + e->rl_q_start = start; + e->rl_q_end = end; + e->rl_q_flags = flags; +#ifdef INVARIANTS + e->rl_q_owner = curthread; +#endif + return (e); } void -rlqentry_free(struct rl_q_entry *rleq) +rangelock_entry_free(struct rl_q_entry *e) { - - uma_zfree(rl_entry_zone, rleq); + uma_zfree_smr(rl_entry_zone, e); } void rangelock_init(struct rangelock *lock) { - - TAILQ_INIT(&lock->rl_waiters); - lock->rl_currdep = NULL; + lock->sleepers = false; + atomic_store_ptr(&lock->head, rangelock_cheat ? RL_CHEAT_CHEATING : 0); } void rangelock_destroy(struct rangelock *lock) { + struct rl_q_entry *e, *ep; + + MPASS(!lock->sleepers); + if (rangelock_cheat_destroy(lock)) + return; + for (e = (struct rl_q_entry *)atomic_load_ptr(&lock->head); + e != NULL; e = rl_e_unmark(ep)) { + ep = atomic_load_ptr(&e->rl_q_next); + uma_zfree_smr(rl_entry_zone, e); + } +} - KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters")); +static bool +rl_e_is_marked(const struct rl_q_entry *e) +{ + return (((uintptr_t)e & 1) != 0); } -/* - * Two entries are compatible if their ranges do not overlap, or both - * entries are for read. 
- */ -static int -ranges_overlap(const struct rl_q_entry *e1, - const struct rl_q_entry *e2) +static struct rl_q_entry * +rl_e_unmark_unchecked(const struct rl_q_entry *e) { + return ((struct rl_q_entry *)((uintptr_t)e & ~1)); +} - if (e1->rl_q_start < e2->rl_q_end && e1->rl_q_end > e2->rl_q_start) - return (1); - return (0); +static struct rl_q_entry * +rl_e_unmark(const struct rl_q_entry *e) +{ + MPASS(rl_e_is_marked(e)); + return (rl_e_unmark_unchecked(e)); } -/* - * Recalculate the lock->rl_currdep after an unlock. - */ static void -rangelock_calc_block(struct rangelock *lock) +rl_e_mark(struct rl_q_entry *e) { - struct rl_q_entry *entry, *nextentry, *entry1; - - for (entry = lock->rl_currdep; entry != NULL; entry = nextentry) { - nextentry = TAILQ_NEXT(entry, rl_q_link); - if (entry->rl_q_flags & RL_LOCK_READ) { - /* Reads must not overlap with granted writes. */ - for (entry1 = TAILQ_FIRST(&lock->rl_waiters); - !(entry1->rl_q_flags & RL_LOCK_READ); - entry1 = TAILQ_NEXT(entry1, rl_q_link)) { - if (ranges_overlap(entry, entry1)) - goto out; - } - } else { - /* Write must not overlap with any granted locks. */ - for (entry1 = TAILQ_FIRST(&lock->rl_waiters); - entry1 != entry; - entry1 = TAILQ_NEXT(entry1, rl_q_link)) { - if (ranges_overlap(entry, entry1)) - goto out; - } +#ifdef INVARIANTS + int r = atomic_testandset_long((uintptr_t *)&e->rl_q_next, 0); + MPASS(r == 0); +#else + atomic_set_ptr((uintptr_t *)&e->rl_q_next, 1); +#endif +} - /* Move grantable write locks to the front. */ - TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link); - TAILQ_INSERT_HEAD(&lock->rl_waiters, entry, rl_q_link); - } +static struct rl_q_entry * +rl_q_load(struct rl_q_entry **p) +{ + return ((struct rl_q_entry *)atomic_load_acq_ptr((uintptr_t *)p)); +} - /* Grant this lock. */ - entry->rl_q_flags |= RL_LOCK_GRANTED; - wakeup(entry); - } -out: - lock->rl_currdep = entry; +static bool +rl_e_is_rlock(const struct rl_q_entry *e) +{ + return ((e->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ); } static void -rangelock_unlock_locked(struct rangelock *lock, struct rl_q_entry *entry, - struct mtx *ilk, bool do_calc_block) +rangelock_unlock_int(struct rangelock *lock, struct rl_q_entry *e) { + bool sleepers; - MPASS(lock != NULL && entry != NULL && ilk != NULL); - mtx_assert(ilk, MA_OWNED); - - if (!do_calc_block) { - /* - * This is the case where rangelock_enqueue() has been called - * with trylock == true and just inserted this entry in the - * queue. - * If rl_currdep is this entry, rl_currdep needs to - * be set to the next entry in the rl_waiters list. - * However, since this entry is the last entry in the - * list, the next entry is NULL. 
- */ - if (lock->rl_currdep == entry) { - KASSERT(TAILQ_NEXT(lock->rl_currdep, rl_q_link) == NULL, - ("rangelock_enqueue: next entry not NULL")); - lock->rl_currdep = NULL; - } - } else - KASSERT(entry != lock->rl_currdep, ("stuck currdep")); - - TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link); - if (do_calc_block) - rangelock_calc_block(lock); - mtx_unlock(ilk); - if (curthread->td_rlqe == NULL) - curthread->td_rlqe = entry; - else - rlqentry_free(entry); + MPASS(lock != NULL && e != NULL); + MPASS(!rl_e_is_marked(rl_q_load(&e->rl_q_next))); + MPASS(e->rl_q_owner == curthread); + + rl_e_mark(e); + sleepers = lock->sleepers; + lock->sleepers = false; + if (sleepers) + sleepq_broadcast(&lock->sleepers, SLEEPQ_SLEEP, 0, 0); } void -rangelock_unlock(struct rangelock *lock, void *cookie, struct mtx *ilk) +rangelock_unlock(struct rangelock *lock, void *cookie) { + if (rangelock_cheat_unlock(lock, cookie)) + return; - MPASS(lock != NULL && cookie != NULL && ilk != NULL); - - mtx_lock(ilk); - rangelock_unlock_locked(lock, cookie, ilk, true); + sleepq_lock(&lock->sleepers); + rangelock_unlock_int(lock, cookie); + sleepq_release(&lock->sleepers); } /* - * Unlock the sub-range of granted lock. + * result: -1 if e1 before e2, or both locks are readers and e1 + * starts before or at e2 + * 1 if e1 after e2, or both locks are readers and e1 + * starts after e2 + * 0 if e1 and e2 overlap and at least one lock is writer */ -void * -rangelock_unlock_range(struct rangelock *lock, void *cookie, off_t start, - off_t end, struct mtx *ilk) +static int +rl_e_compare(const struct rl_q_entry *e1, const struct rl_q_entry *e2) { - struct rl_q_entry *entry; - - MPASS(lock != NULL && cookie != NULL && ilk != NULL); - entry = cookie; - KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED, - ("Unlocking non-granted lock")); - KASSERT(entry->rl_q_start == start, ("wrong start")); - KASSERT(entry->rl_q_end >= end, ("wrong end")); - - mtx_lock(ilk); - if (entry->rl_q_end == end) { - rangelock_unlock_locked(lock, cookie, ilk, true); - return (NULL); - } - entry->rl_q_end = end; - rangelock_calc_block(lock); - mtx_unlock(ilk); - return (cookie); + bool rds; + + if (e1 == NULL) + return (1); + if (e2->rl_q_start >= e1->rl_q_end) + return (-1); + rds = rl_e_is_rlock(e1) && rl_e_is_rlock(e2); + if (e2->rl_q_start >= e1->rl_q_start && rds) + return (-1); + if (e1->rl_q_start >= e2->rl_q_end) + return (1); + if (e1->rl_q_start >= e2->rl_q_start && rds) + return (1); + return (0); } -/* - * Add the lock request to the queue of the pending requests for - * rangelock. Sleep until the request can be granted unless trylock == true. 
- */ -static void * -rangelock_enqueue(struct rangelock *lock, off_t start, off_t end, int mode, - struct mtx *ilk, bool trylock) +static void +rl_insert_sleep(struct rangelock *lock) { - struct rl_q_entry *entry; - struct thread *td; + smr_exit(rl_smr); + DROP_GIANT(); + lock->sleepers = true; + sleepq_add(&lock->sleepers, NULL, "rangelk", 0, 0); + sleepq_wait(&lock->sleepers, PRI_USER); + PICKUP_GIANT(); + smr_enter(rl_smr); +} - MPASS(lock != NULL && ilk != NULL); +static bool +rl_q_cas(struct rl_q_entry **prev, struct rl_q_entry *old, + struct rl_q_entry *new) +{ + return (atomic_cmpset_rel_ptr((uintptr_t *)prev, (uintptr_t)old, + (uintptr_t)new) != 0); +} - td = curthread; - if (td->td_rlqe != NULL) { - entry = td->td_rlqe; - td->td_rlqe = NULL; - } else - entry = rlqentry_alloc(); - MPASS(entry != NULL); - entry->rl_q_flags = mode; - entry->rl_q_start = start; - entry->rl_q_end = end; - - mtx_lock(ilk); - /* - * XXXKIB TODO. Check that a thread does not try to enqueue a - * lock that is incompatible with another request from the same - * thread. - */ - - TAILQ_INSERT_TAIL(&lock->rl_waiters, entry, rl_q_link); - /* - * If rl_currdep == NULL, there is no entry waiting for a conflicting - * range to be resolved, so set rl_currdep to this entry. If there is - * no conflicting entry for this entry, rl_currdep will be set back to - * NULL by rangelock_calc_block(). - */ - if (lock->rl_currdep == NULL) - lock->rl_currdep = entry; - rangelock_calc_block(lock); - while (!(entry->rl_q_flags & RL_LOCK_GRANTED)) { +enum RL_INSERT_RES { + RL_TRYLOCK_FAILED, + RL_LOCK_SUCCESS, + RL_LOCK_RETRY, +}; + +static enum RL_INSERT_RES +rl_r_validate(struct rangelock *lock, struct rl_q_entry *e, bool trylock, + struct rl_q_entry **free) +{ + struct rl_q_entry *cur, *next, **prev; + + prev = &e->rl_q_next; + cur = rl_q_load(prev); + MPASS(!rl_e_is_marked(cur)); /* nobody can unlock e yet */ + for (;;) { + if (cur == NULL || cur->rl_q_start > e->rl_q_end) + return (RL_LOCK_SUCCESS); + next = rl_q_load(&cur->rl_q_next); + if (rl_e_is_marked(next)) { + next = rl_e_unmark(next); + if (rl_q_cas(prev, cur, next)) { + cur->rl_q_free = *free; + *free = cur; + } + cur = next; + continue; + } + if (rl_e_is_rlock(cur)) { + prev = &cur->rl_q_next; + cur = rl_e_unmark_unchecked(rl_q_load(prev)); + continue; + } + if (!rl_e_is_marked(rl_q_load(&cur->rl_q_next))) { + sleepq_lock(&lock->sleepers); + if (rl_e_is_marked(rl_q_load(&cur->rl_q_next))) { + sleepq_release(&lock->sleepers); + continue; + } + rangelock_unlock_int(lock, e); + if (trylock) { + sleepq_release(&lock->sleepers); + return (RL_TRYLOCK_FAILED); + } + rl_insert_sleep(lock); + return (RL_LOCK_RETRY); + } + } +} + +static enum RL_INSERT_RES +rl_w_validate(struct rangelock *lock, struct rl_q_entry *e, + bool trylock, struct rl_q_entry **free) +{ + struct rl_q_entry *cur, *next, **prev; + + prev = (struct rl_q_entry **)&lock->head; + cur = rl_q_load(prev); + MPASS(!rl_e_is_marked(cur)); /* head is not marked */ + for (;;) { + if (cur == e) + return (RL_LOCK_SUCCESS); + next = rl_q_load(&cur->rl_q_next); + if (rl_e_is_marked(next)) { + next = rl_e_unmark(next); + if (rl_q_cas(prev, cur, next)) { + /* Link removed entries via rl_q_free, not rl_q_next, + * which other threads may still follow. */ + cur->rl_q_free = *free; + *free = cur; + } + cur = next; + continue; + } + if (cur->rl_q_end <= e->rl_q_start) { + prev = &cur->rl_q_next; + cur = rl_e_unmark_unchecked(rl_q_load(prev)); + continue; + } + sleepq_lock(&lock->sleepers); + rangelock_unlock_int(lock, e); if (trylock) { - /* - * For this case, the range is not actually locked - * yet, but removal from the 
list requires the same - * steps, except for not doing a rangelock_calc_block() - * call, since rangelock_calc_block() was called above. - */ - rangelock_unlock_locked(lock, entry, ilk, false); - return (NULL); + sleepq_release(&lock->sleepers); + return (RL_TRYLOCK_FAILED); } - msleep(entry, ilk, 0, "range", 0); + rl_insert_sleep(lock); + return (RL_LOCK_RETRY); } - mtx_unlock(ilk); - return (entry); } -void * -rangelock_rlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk) +static enum RL_INSERT_RES +rl_insert(struct rangelock *lock, struct rl_q_entry *e, bool trylock, + struct rl_q_entry **free) { + struct rl_q_entry *cur, *next, **prev; + int r; + +again: + prev = (struct rl_q_entry **)&lock->head; + cur = rl_q_load(prev); + if (cur == NULL && rl_q_cas(prev, NULL, e)) + return (RL_LOCK_SUCCESS); + + for (;;) { + if (cur != NULL) { + if (rl_e_is_marked(cur)) + goto again; + + next = rl_q_load(&cur->rl_q_next); + if (rl_e_is_marked(next)) { + next = rl_e_unmark(next); + if (rl_q_cas(prev, cur, next)) { +#ifdef INVARIANTS + cur->rl_q_owner = NULL; +#endif + cur->rl_q_free = *free; + *free = cur; + } + cur = next; + continue; + } + } - return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk, false)); + r = rl_e_compare(cur, e); + if (r == -1) { + prev = &cur->rl_q_next; + cur = rl_q_load(prev); + } else if (r == 0) { + sleepq_lock(&lock->sleepers); + if (__predict_false(rl_e_is_marked(rl_q_load( + &cur->rl_q_next)))) { + sleepq_release(&lock->sleepers); + continue; + } + if (trylock) { + sleepq_release(&lock->sleepers); + return (RL_TRYLOCK_FAILED); + } + rl_insert_sleep(lock); + /* e is still valid */ + goto again; + } else /* r == 1 */ { + e->rl_q_next = cur; + if (rl_q_cas(prev, cur, e)) { + atomic_thread_fence_acq(); + return (rl_e_is_rlock(e) ? + rl_r_validate(lock, e, trylock, free) : + rl_w_validate(lock, e, trylock, free)); + } + /* Reset rl_q_next in case we hit fast path. 
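The CAS lost the race and e is not yet published, so clear the stale pointer; otherwise a later pass through the empty-list insert under the "again" label could publish e still pointing at cur. 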
*/ + e->rl_q_next = NULL; + cur = rl_q_load(prev); + } + } } -void * -rangelock_tryrlock(struct rangelock *lock, off_t start, off_t end, - struct mtx *ilk) +static struct rl_q_entry * +rangelock_lock_int(struct rangelock *lock, bool trylock, vm_ooffset_t start, + vm_ooffset_t end, int locktype) { + struct rl_q_entry *e, *free, *x, *xp; + struct thread *td; + void *cookie; + enum RL_INSERT_RES res; - return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk, true)); + if (rangelock_cheat_lock(lock, locktype, trylock, &cookie)) + return (cookie); + td = curthread; + for (res = RL_LOCK_RETRY; res == RL_LOCK_RETRY;) { + free = NULL; + e = rlqentry_alloc(start, end, locktype); + smr_enter(rl_smr); + res = rl_insert(lock, e, trylock, &free); + smr_exit(rl_smr); + if (res == RL_TRYLOCK_FAILED) { + MPASS(trylock); + e->rl_q_free = free; + free = e; + e = NULL; + } + for (x = free; x != NULL; x = xp) { + MPASS(!rl_e_is_marked(x)); + xp = x->rl_q_free; + MPASS(!rl_e_is_marked(xp)); + if (td->td_rlqe == NULL) { + smr_synchronize(rl_smr); + td->td_rlqe = x; + } else { + uma_zfree_smr(rl_entry_zone, x); + } + } + } + return (e); } void * -rangelock_wlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk) +rangelock_rlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end) { + return (rangelock_lock_int(lock, false, start, end, RL_LOCK_READ)); +} - return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk, false)); +void * +rangelock_tryrlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end) +{ + return (rangelock_lock_int(lock, true, start, end, RL_LOCK_READ)); } void * -rangelock_trywlock(struct rangelock *lock, off_t start, off_t end, - struct mtx *ilk) +rangelock_wlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end) { + return (rangelock_lock_int(lock, false, start, end, RL_LOCK_WRITE)); +} - return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk, true)); +void * +rangelock_trywlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end) +{ + return (rangelock_lock_int(lock, true, start, end, RL_LOCK_WRITE)); } #ifdef INVARIANT_SUPPORT void _rangelock_cookie_assert(void *cookie, int what, const char *file, int line) { - struct rl_q_entry *entry; - int flags; - - MPASS(cookie != NULL); - entry = cookie; - flags = entry->rl_q_flags; - switch (what) { - case RCA_LOCKED: - if ((flags & RL_LOCK_GRANTED) == 0) - panic("rangelock not held @ %s:%d\n", file, line); - break; - case RCA_RLOCKED: - if ((flags & (RL_LOCK_GRANTED | RL_LOCK_READ)) != - (RL_LOCK_GRANTED | RL_LOCK_READ)) - panic("rangelock not rlocked @ %s:%d\n", file, line); - break; - case RCA_WLOCKED: - if ((flags & (RL_LOCK_GRANTED | RL_LOCK_WRITE)) != - (RL_LOCK_GRANTED | RL_LOCK_WRITE)) - panic("rangelock not wlocked @ %s:%d\n", file, line); - break; - default: - panic("Unknown rangelock assertion: %d @ %s:%d", what, file, - line); - } } #endif /* INVARIANT_SUPPORT */ + +#include "opt_ddb.h" +#ifdef DDB +#include + +DB_SHOW_COMMAND(rangelock, db_show_rangelock) +{ + struct rangelock *lock; + struct rl_q_entry *e, *x; + uintptr_t v; + + if (!have_addr) { + db_printf("show rangelock addr\n"); + return; + } + + lock = (struct rangelock *)addr; + db_printf("rangelock %p sleepers %d\n", lock, lock->sleepers); + v = lock->head; + if ((v & RL_CHEAT_CHEATING) != 0) { + db_printf(" cheating head %#jx\n", (uintmax_t)v); + return; + } + for (e = (struct rl_q_entry *)(lock->head);;) { + x = rl_e_is_marked(e) ? 
rl_e_unmark(e) : e; + if (x == NULL) + break; + db_printf(" entry %p marked %d %d start %#jx end %#jx " + "flags %x next %p", + e, rl_e_is_marked(e), rl_e_is_marked(x->rl_q_next), + x->rl_q_start, x->rl_q_end, x->rl_q_flags, x->rl_q_next); +#ifdef INVARIANTS + db_printf(" owner %p (%d)", x->rl_q_owner, + x->rl_q_owner != NULL ? x->rl_q_owner->td_tid : -1); +#endif + db_printf("\n"); + e = x->rl_q_next; + } +} + +#endif /* DDB */ diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c --- a/sys/kern/kern_thread.c +++ b/sys/kern/kern_thread.c @@ -478,9 +478,9 @@ td = (struct thread *)mem; EVENTHANDLER_DIRECT_INVOKE(thread_fini, td); - rlqentry_free(td->td_rlqe); turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); + rangelock_entry_free(td->td_rlqe); umtx_thread_fini(td); MPASS(td->td_sel == NULL); } diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -184,13 +184,13 @@ "Number of contig reclaims before giving up for default alloc policy"); #define shm_rangelock_unlock(shmfd, cookie) \ - rangelock_unlock(&(shmfd)->shm_rl, (cookie), &(shmfd)->shm_mtx) + rangelock_unlock(&(shmfd)->shm_rl, (cookie)) #define shm_rangelock_rlock(shmfd, start, end) \ - rangelock_rlock(&(shmfd)->shm_rl, (start), (end), &(shmfd)->shm_mtx) + rangelock_rlock(&(shmfd)->shm_rl, (start), (end)) #define shm_rangelock_tryrlock(shmfd, start, end) \ - rangelock_tryrlock(&(shmfd)->shm_rl, (start), (end), &(shmfd)->shm_mtx) + rangelock_tryrlock(&(shmfd)->shm_rl, (start), (end)) #define shm_rangelock_wlock(shmfd, start, end) \ - rangelock_wlock(&(shmfd)->shm_rl, (start), (end), &(shmfd)->shm_mtx) + rangelock_wlock(&(shmfd)->shm_rl, (start), (end)) static int uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -2180,8 +2180,6 @@ VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, ("dirty blk trie not empty")); - VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, - ("Dangling rangelock waiters")); VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, ("Leaked inactivation")); VI_UNLOCK(vp); diff --git a/sys/sys/rangelock.h b/sys/sys/rangelock.h --- a/sys/sys/rangelock.h +++ b/sys/sys/rangelock.h @@ -29,12 +29,14 @@ #ifndef _SYS_RANGELOCK_H #define _SYS_RANGELOCK_H -#include +#include +#ifndef _KERNEL +#include +#endif #define RL_LOCK_READ 0x0001 #define RL_LOCK_WRITE 0x0002 #define RL_LOCK_TYPE_MASK 0x0003 -#define RL_LOCK_GRANTED 0x0004 struct rl_q_entry; @@ -44,42 +46,26 @@ * all existing lock owners are compatible with the request. Two lock * owners are compatible if their ranges do not overlap, or both * owners are for read. - * - * Access to the structure itself is synchronized with the externally - * supplied mutex. - * - * rl_waiters is the queue containing in order (a) granted write lock - * requests, (b) granted read lock requests, and (c) in order of arrival, - * lock requests which cannot be granted yet. - * - * rl_currdep is the first lock request that cannot be granted now due - * to the preceding requests conflicting with it (i.e., it points to - * position (c) in the list above). 
*/ struct rangelock { - TAILQ_HEAD(, rl_q_entry) rl_waiters; - struct rl_q_entry *rl_currdep; + uintptr_t head; + bool sleepers; }; #ifdef _KERNEL -struct mtx; - void rangelock_init(struct rangelock *lock); void rangelock_destroy(struct rangelock *lock); -void rangelock_unlock(struct rangelock *lock, void *cookie, - struct mtx *ilk); -void *rangelock_unlock_range(struct rangelock *lock, void *cookie, - off_t start, off_t end, struct mtx *ilk); -void *rangelock_rlock(struct rangelock *lock, off_t start, off_t end, - struct mtx *ilk); -void *rangelock_tryrlock(struct rangelock *lock, off_t start, off_t end, - struct mtx *ilk); -void *rangelock_wlock(struct rangelock *lock, off_t start, off_t end, - struct mtx *ilk); -void *rangelock_trywlock(struct rangelock *lock, off_t start, off_t end, - struct mtx *ilk); -void rlqentry_free(struct rl_q_entry *rlqe); +void rangelock_unlock(struct rangelock *lock, void *cookie); +void *rangelock_rlock(struct rangelock *lock, vm_ooffset_t start, + vm_ooffset_t end); +void *rangelock_tryrlock(struct rangelock *lock, vm_ooffset_t start, + vm_ooffset_t end); +void *rangelock_wlock(struct rangelock *lock, vm_ooffset_t start, + vm_ooffset_t end); +void *rangelock_trywlock(struct rangelock *lock, vm_ooffset_t start, + vm_ooffset_t end); +void rangelock_entry_free(struct rl_q_entry *e); #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void _rangelock_cookie_assert(void *cookie, int what, const char *file, int line); diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -833,18 +833,15 @@ #define vn_seqc_consistent(vp, seq) seqc_consistent(&(vp)->v_seqc, seq) #define vn_rangelock_unlock(vp, cookie) \ - rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp)) -#define vn_rangelock_unlock_range(vp, cookie, start, end) \ - rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), \ - VI_MTX(vp)) + rangelock_unlock(&(vp)->v_rl, (cookie)) #define vn_rangelock_rlock(vp, start, end) \ - rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) + rangelock_rlock(&(vp)->v_rl, (start), (end)) #define vn_rangelock_tryrlock(vp, start, end) \ - rangelock_tryrlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) + rangelock_tryrlock(&(vp)->v_rl, (start), (end)) #define vn_rangelock_wlock(vp, start, end) \ - rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) + rangelock_wlock(&(vp)->v_rl, (start), (end)) #define vn_rangelock_trywlock(vp, start, end) \ - rangelock_trywlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) + rangelock_trywlock(&(vp)->v_rl, (start), (end)) #define vn_irflag_read(vp) atomic_load_short(&(vp)->v_irflag) void vn_irflag_set_locked(struct vnode *vp, short toset);
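
A minimal caller-side sketch of the reworked interface (illustrative only; the helper below and its parameters are hypothetical and not part of this change, while the vn_rangelock_*() macros are the ones defined above). The interlock mutex argument is gone and partial unlocks are no longer offered, so a caller brackets the affected byte range with a single lock/unlock pair:

#include <sys/param.h>
#include <sys/rangelock.h>
#include <sys/vnode.h>

static int
example_io_range(struct vnode *vp, off_t start, off_t len)
{
	void *cookie;

	/* Write-lock the half-open range [start, start + len); no interlock mutex is passed. */
	cookie = vn_rangelock_wlock(vp, start, start + len);

	/* ... perform the I/O against the locked range ... */

	/* A single cookie-based unlock releases the whole range. */
	vn_rangelock_unlock(vp, cookie);
	return (0);
}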