Index: sys/amd64/amd64/pmap.c =================================================================== --- sys/amd64/amd64/pmap.c +++ sys/amd64/amd64/pmap.c @@ -114,6 +114,7 @@ #include #include #include +#include #include #include #include @@ -349,6 +350,7 @@ vm_paddr_t dmaplimit; vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; pt_entry_t pg_nx; +static epoch_t pmap_epoch; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); @@ -438,14 +440,12 @@ CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU", "Count of saved TLB context on switch"); -static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = - LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); -static struct mtx invl_gen_mtx; -static u_long pmap_invl_gen = 0; -/* Fake lock object to satisfy turnstiles interface. */ -static struct lock_object invl_gen_ts = { - .lo_name = "invlts", -}; +static void +pmap_epoch_init(void *arg __unused) +{ + pmap_epoch = epoch_alloc(EPOCH_PREEMPT); +} +SYSINIT(epoch, SI_SUB_TASKQ + 1, SI_ORDER_ANY, pmap_epoch_init, NULL); static bool pmap_not_in_di(void) @@ -468,19 +468,8 @@ static void pmap_delayed_invl_started(void) { - struct pmap_invl_gen *invl_gen; - u_long currgen; - - invl_gen = &curthread->td_md.md_invl_gen; - PMAP_ASSERT_NOT_IN_DI(); - mtx_lock(&invl_gen_mtx); - if (LIST_EMPTY(&pmap_invl_gen_tracker)) - currgen = pmap_invl_gen; - else - currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen; - invl_gen->gen = currgen + 1; - LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link); - mtx_unlock(&invl_gen_mtx); + epoch_enter_preempt(pmap_epoch); + curthread->td_md.md_invl_gen.gen = 1; } /* @@ -500,28 +489,8 @@ static void pmap_delayed_invl_finished(void) { - struct pmap_invl_gen *invl_gen, *next; - struct turnstile *ts; - - invl_gen = &curthread->td_md.md_invl_gen; - KASSERT(invl_gen->gen != 0, ("missed invl_started")); - mtx_lock(&invl_gen_mtx); - next = LIST_NEXT(invl_gen, link); - if (next == NULL) { - turnstile_chain_lock(&invl_gen_ts); - ts = turnstile_lookup(&invl_gen_ts); - pmap_invl_gen = invl_gen->gen; - if (ts != NULL) { - turnstile_broadcast(ts, TS_SHARED_QUEUE); - turnstile_unpend(ts); - } - turnstile_chain_unlock(&invl_gen_ts); - } else { - next->gen = invl_gen->gen; - } - LIST_REMOVE(invl_gen, link); - mtx_unlock(&invl_gen_mtx); - invl_gen->gen = 0; + curthread->td_md.md_invl_gen.gen = 0; + epoch_exit_preempt(pmap_epoch); } #ifdef PV_STATS @@ -544,36 +513,14 @@ * pmap_delayed_invl_wait(), upon its return we know that no CPU has a * valid mapping for the page m in either its page table or TLB. * - * This function works by blocking until the global DI generation - * number catches up with the generation number associated with the - * given page m and its PV list. Since this function's callers - * typically own an object lock and sometimes own a page lock, it - * cannot sleep. Instead, it blocks on a turnstile to relinquish the - * processor. + * This function works by checking that there are either no callers + * within a DI block or if there are that a grace period elapses for + * any callers in an epoch section when it is initially called. 
*/ static void pmap_delayed_invl_wait(vm_page_t m) { - struct turnstile *ts; - u_long *m_gen; -#ifdef PV_STATS - bool accounted = false; -#endif - - m_gen = pmap_delayed_invl_genp(m); - while (*m_gen > pmap_invl_gen) { -#ifdef PV_STATS - if (!accounted) { - atomic_add_long(&invl_wait, 1); - accounted = true; - } -#endif - ts = turnstile_trywait(&invl_gen_ts); - if (*m_gen > pmap_invl_gen) - turnstile_wait(ts, NULL, TS_SHARED_QUEUE); - else - turnstile_cancel(ts); - } + epoch_wait_preempt(pmap_epoch); } /* @@ -1130,11 +1077,6 @@ TAILQ_INIT(&kernel_pmap->pm_pvchunk); kernel_pmap->pm_flags = pmap_flags; - /* - * Initialize the TLB invalidations generation number lock. - */ - mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); - /* * Reserve some special page table entries/VA space for temporary * mapping of pages. Index: sys/amd64/include/counter.h =================================================================== --- sys/amd64/include/counter.h +++ sys/amd64/include/counter.h @@ -45,7 +45,7 @@ counter_u64_read_one(uint64_t *p, int cpu) { - return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); + return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu)); } static inline uint64_t @@ -65,7 +65,7 @@ counter_u64_zero_one_cpu(void *arg) { - *((uint64_t *)((char *)arg + sizeof(struct pcpu) * + *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE * PCPU_GET(cpuid))) = 0; } Index: sys/arm/include/counter.h =================================================================== --- sys/arm/include/counter.h +++ sys/arm/include/counter.h @@ -47,7 +47,7 @@ counter_u64_read_one(uint64_t *p, int cpu) { - return (atomic_load_64((uint64_t *)((char *)p + sizeof(struct pcpu) * + return (atomic_load_64((uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu))); } @@ -68,7 +68,7 @@ counter_u64_zero_one_cpu(void *arg) { - atomic_store_64((uint64_t *)((char *)arg + sizeof(struct pcpu) * + atomic_store_64((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE * PCPU_GET(cpuid)), 0); } Index: sys/arm64/include/counter.h =================================================================== --- sys/arm64/include/counter.h +++ sys/arm64/include/counter.h @@ -44,7 +44,7 @@ counter_u64_read_one(uint64_t *p, int cpu) { - return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); + return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu)); } static inline uint64_t @@ -64,7 +64,7 @@ counter_u64_zero_one_cpu(void *arg) { - *((uint64_t *)((char *)arg + sizeof(struct pcpu) * + *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE * PCPU_GET(cpuid))) = 0; } Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -3894,6 +3894,8 @@ kern/subr_msgbuf.c standard kern/subr_param.c standard kern/subr_pcpu.c standard +kern/subr_pcpu_quota.c standard +kern/subr_pcpu_refcount.c standard kern/subr_pctrie.c standard kern/subr_pidctrl.c standard kern/subr_power.c standard Index: sys/i386/include/counter.h =================================================================== --- sys/i386/include/counter.h +++ sys/i386/include/counter.h @@ -104,13 +104,13 @@ critical_enter(); CPU_FOREACH(i) { res += *(uint64_t *)((char *)p + - sizeof(struct pcpu) * i); + UMA_PCPU_ZONE_SIZE * i); } critical_exit(); } else { CPU_FOREACH(i) res += counter_u64_read_one_8b((uint64_t *)((char *)p + - sizeof(struct pcpu) * i)); + UMA_PCPU_ZONE_SIZE * i)); } return (res); } @@ -137,7 +137,7 @@ { uint64_t *p; - p = (uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid)); + p = (uint64_t *)((char 
*)arg + UMA_PCPU_ZONE_SIZE * PCPU_GET(cpuid)); counter_u64_zero_one_8b(p); } @@ -149,7 +149,7 @@ if ((cpu_feature & CPUID_CX8) == 0) { critical_enter(); CPU_FOREACH(i) - *(uint64_t *)((char *)c + sizeof(struct pcpu) * i) = 0; + *(uint64_t *)((char *)c + UMA_PCPU_ZONE_SIZE * i) = 0; critical_exit(); } else { smp_rendezvous(smp_no_rendezvous_barrier, Index: sys/kern/kern_prot.c =================================================================== --- sys/kern/kern_prot.c +++ sys/kern/kern_prot.c @@ -1829,7 +1829,8 @@ struct ucred *cr; cr = malloc(sizeof(*cr), M_CRED, M_WAITOK | M_ZERO); - refcount_init(&cr->cr_ref, 1); + cr->cr_pref = pcpu_ref_alloc(M_WAITOK); + #ifdef AUDIT audit_cred_init(cr); #endif @@ -1848,11 +1849,19 @@ struct ucred * crhold(struct ucred *cr) { + if (cr->cr_flags & CRED_FLAG_ONSTACK) + return (cr); - refcount_acquire(&cr->cr_ref); + pcpu_ref_acquire(cr->cr_pref); return (cr); } +void +crdrop_owner(struct ucred *cr) +{ + pcpu_ref_kill(cr->cr_pref); +} + /* * Free a cred structure. Throws away space when ref count gets to 0. */ @@ -1860,9 +1869,12 @@ crfree(struct ucred *cr) { - KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref)); - KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred")); - if (refcount_release(&cr->cr_ref)) { + KASSERT((unsigned int)((uintptr_t)cr->cr_pref) != 0xdeadc0de, ("cr: %p dangling reference to ucred", cr)); + if (cr->cr_flags & CRED_FLAG_ONSTACK) + return; + + if (pcpu_ref_release(cr->cr_pref)) { + pcpu_ref_free(cr->cr_pref); /* * Some callers of crget(), such as nfs_statfs(), * allocate a temporary credential, but don't @@ -1898,7 +1910,7 @@ crcopy(struct ucred *dest, struct ucred *src) { - KASSERT(dest->cr_ref == 1, ("crcopy of shared ucred")); + //KASSERT(dest->cr_ref == 1, ("crcopy of shared ucred")); bcopy(&src->cr_startcopy, &dest->cr_startcopy, (unsigned)((caddr_t)&src->cr_endcopy - (caddr_t)&src->cr_startcopy)); @@ -1913,6 +1925,7 @@ #ifdef MAC mac_cred_copy(src, dest); #endif + dest->cr_flags &= ~(CRED_FLAG_ONSTACK|CRED_FLAG_OWNED); } /* @@ -1953,7 +1966,16 @@ void proc_set_cred_init(struct proc *p, struct ucred *newcred) { - + struct ucred *dupcred; +#ifdef notyet + if (newcred->cr_flags & CRED_FLAG_OWNED) +#endif + { + dupcred = crdup(newcred); + crfree(newcred); + newcred = dupcred; + } + newcred->cr_flags |= CRED_FLAG_OWNED; p->p_ucred = newcred; } @@ -1975,10 +1997,23 @@ MPASS(p->p_ucred != NULL); if (newcred == NULL) MPASS(p->p_state == PRS_ZOMBIE); - else + else { +#ifdef notyet + if (newcred->cr_flags & CRED_FLAG_OWNED) + { + oldcred = crdup(newcred); + crfree(newcred); + newcred = oldcred; + newcred->cr_flags |= CRED_FLAG_OWNED; + } +#endif + MPASS((newcred->cr_flags & CRED_FLAG_OWNED) == 0); + newcred->cr_flags |= CRED_FLAG_OWNED; PROC_LOCK_ASSERT(p, MA_OWNED); - + } oldcred = p->p_ucred; + crdrop_owner(oldcred); + p->p_ucred = newcred; if (newcred != NULL) PROC_UPDATE_COW(p); @@ -2002,7 +2037,6 @@ oldcred = p->p_ucred; } crcopy(cr, oldcred); - return (oldcred); } Index: sys/kern/kern_resource.c =================================================================== --- sys/kern/kern_resource.c +++ sys/kern/kern_resource.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -65,6 +66,7 @@ #include #include #include +#include static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures"); @@ -1244,6 +1246,7 @@ return (uip); } + /* * Find or allocate a struct uidinfo for a particular uid. * Returns with uidinfo struct referenced. 
@@ -1276,7 +1279,8 @@ racct_create(&new_uip->ui_racct); refcount_init(&new_uip->ui_ref, 1); new_uip->ui_uid = uid; - mtx_init(&new_uip->ui_vmsize_mtx, "ui_vmsize", NULL, MTX_DEF); + new_uip->ui_vmsize_pq = pcpu_quota_alloc(&new_uip->ui_vmsize, + vmsize_max_pcpu_slop, swap_pager_vmsize_alloc, new_uip, M_WAITOK); rw_wlock(&uihashtbl_lock); /* @@ -1291,7 +1295,6 @@ } else { rw_wunlock(&uihashtbl_lock); racct_destroy(&new_uip->ui_racct); - mtx_destroy(&new_uip->ui_vmsize_mtx); free(new_uip, M_UIDINFO); } return (uip); @@ -1343,6 +1346,7 @@ LIST_REMOVE(uip, ui_hash); rw_wunlock(&uihashtbl_lock); + pcpu_quota_cache_set(uip->ui_vmsize_pq, 0); if (uip->ui_sbsize != 0) printf("freeing uidinfo: uid = %d, sbsize = %ld\n", uip->ui_uid, uip->ui_sbsize); @@ -1352,7 +1356,6 @@ if (uip->ui_vmsize != 0) printf("freeing uidinfo: uid = %d, swapuse = %lld\n", uip->ui_uid, (unsigned long long)uip->ui_vmsize); - mtx_destroy(&uip->ui_vmsize_mtx); free(uip, M_UIDINFO); } Index: sys/kern/subr_counter.c =================================================================== --- sys/kern/subr_counter.c +++ sys/kern/subr_counter.c @@ -50,6 +50,15 @@ counter_u64_zero_inline(c); } +static void +counter_u64_zero_sync(counter_u64_t c) +{ + int cpu; + + CPU_FOREACH(cpu) + *(uint64_t*)zpcpu_get_cpu(c, cpu) = 0; +} + uint64_t counter_u64_fetch(counter_u64_t c) { @@ -64,7 +73,7 @@ r = uma_zalloc_pcpu(pcpu_zone_64, flags); if (r != NULL) - counter_u64_zero(r); + counter_u64_zero_sync(r); return (r); } Index: sys/kern/subr_pcpu.c =================================================================== --- sys/kern/subr_pcpu.c +++ sys/kern/subr_pcpu.c @@ -75,8 +75,8 @@ static DPCPU_DEFINE(char, modspace[DPCPU_MODMIN]); static TAILQ_HEAD(, dpcpu_free) dpcpu_head = TAILQ_HEAD_INITIALIZER(dpcpu_head); static struct sx dpcpu_lock; -uintptr_t dpcpu_off[MAXCPU]; -struct pcpu *cpuid_to_pcpu[MAXCPU]; +__read_mostly uintptr_t dpcpu_off[MAXCPU]; +__read_mostly struct pcpu *cpuid_to_pcpu[MAXCPU]; struct cpuhead cpuhead = STAILQ_HEAD_INITIALIZER(cpuhead); /* Index: sys/kern/subr_pcpu_quota.c =================================================================== --- /dev/null +++ sys/kern/subr_pcpu_quota.c @@ -0,0 +1,184 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2018, Matthew Macy + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static MALLOC_DEFINE(M_PCPU_QUOTA, "Per-cpu", "Per-cpu resource accounting."); + +#define PCPU_QUOTA_SLOP_GET(p) zpcpu_get((p)->pq_slop) +#define PCPU_QUOTA_CAN_CACHE 0x1 +#define PCPU_QUOTA_FLUSHING 0x2 + +struct pcpu_quota { + void *pq_context; + counter_u64_t pq_slop; + uintptr_t *pq_global; + uintptr_t pq_pcpu_slop; + int (*pq_alloc)(void *context, uintptr_t incr, uintptr_t *slop); + volatile int pq_flags; +} __aligned(CACHE_LINE_SIZE); + + +#ifdef __LP64__ +#define atomic_subtract_uintptr atomic_subtract_long +#else +#define atomic_subtract_uintptr atomic_subtract_int +#endif + +static void +pcpu_quota_flush(struct pcpu_quota *pq) +{ + int64_t *p; + uintptr_t value; + int cpu; + + value = 0; + epoch_enter(global_epoch); + CPU_FOREACH(cpu) { + p = zpcpu_get_cpu(pq->pq_slop, cpu); + MPASS(*p >= 0); + value += *p; + *p = 0; + } + if (value) + atomic_subtract_uintptr(pq->pq_global, value); + epoch_exit(global_epoch); +} + +void +pcpu_quota_cache_set(struct pcpu_quota *pq, int enable) +{ + int *flagsp; + + flagsp = (int *)(uintptr_t)&pq->pq_flags; + if (!enable && (pq->pq_flags & PCPU_QUOTA_CAN_CACHE)) { + if (ck_pr_btr_int(flagsp, PCPU_QUOTA_CAN_CACHE) == 0 && + ck_pr_bts_int(flagsp, PCPU_QUOTA_FLUSHING) == 0) { + epoch_wait(global_epoch); + pcpu_quota_flush(pq); + ck_pr_btr_int(flagsp, PCPU_QUOTA_FLUSHING); + } + } else if (enable && (pq->pq_flags & PCPU_QUOTA_CAN_CACHE) == 0) { + while (pq->pq_flags & PCPU_QUOTA_FLUSHING) + cpu_spinwait(); + ck_pr_bts_int(flagsp, PCPU_QUOTA_CAN_CACHE); + } +} + +struct pcpu_quota * +pcpu_quota_alloc(uintptr_t *global, uintptr_t pcpu_slop, + int (*alloc)(void *, uintptr_t, uintptr_t*), void *context, int flags) +{ + struct pcpu_quota *pq; + + flags &= ~M_ZERO; + if ((pq = malloc(sizeof(*pq), M_PCPU_QUOTA, flags)) == NULL) + return (NULL); + if ((pq->pq_slop = counter_u64_alloc(flags)) == NULL) { + free(pq, M_PCPU_QUOTA); + return (NULL); + } + pq->pq_pcpu_slop = pcpu_slop; + pq->pq_context = context; + pq->pq_global = global; + pq->pq_alloc = alloc; + pq->pq_flags = PCPU_QUOTA_CAN_CACHE; + return (pq); +} + +void +pcpu_quota_free(struct pcpu_quota *pq) +{ + counter_u64_free(pq->pq_slop); + free(pq, M_PCPU_QUOTA); +} + +int +pcpu_quota_incr(struct pcpu_quota *pq, uintptr_t incr) +{ + int64_t *p; + int rc; + + epoch_enter(global_epoch); + p = PCPU_QUOTA_SLOP_GET(pq); + if (*p >= incr) { + *p -= incr; + epoch_exit(global_epoch); + return (1); + } + incr -= *p; + *p = 0; + rc = pq->pq_alloc(pq->pq_context, incr, (uintptr_t *)p); + if ( __predict_false((pq->pq_flags & PCPU_QUOTA_CAN_CACHE) == 0) && *p > 0) + pcpu_quota_cache_set(pq, 1); + + epoch_exit(global_epoch); + return (rc); +} + +void +pcpu_quota_decr(struct pcpu_quota *pq, uintptr_t decr) +{ + int64_t *p; + int64_t value; + long adj; + + epoch_enter(global_epoch); + p = PCPU_QUOTA_SLOP_GET(pq); + if 
(__predict_true(pq->pq_flags & PCPU_QUOTA_CAN_CACHE)) { + if (*p + decr <= pq->pq_pcpu_slop) { + *p += decr; + epoch_exit(global_epoch); + return; + } + adj = (pq->pq_pcpu_slop >> 1); + value = decr + (*p - adj); + } else { + adj = 0; + value = *p + decr; + } + MPASS(value > 0); + *p = adj; + atomic_subtract_uintptr(pq->pq_global, (uintptr_t)value); + epoch_exit(global_epoch); +} + Index: sys/kern/subr_pcpu_refcount.c =================================================================== --- /dev/null +++ sys/kern/subr_pcpu_refcount.c @@ -0,0 +1,164 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2018, Matthew Macy + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + */ + +#include +__FBSDID("$FreeBSD$"); +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static MALLOC_DEFINE(M_PCPU_REF, "Pcpuref", "Per-cpu reference counting."); +#define PR_DYING 0x1 + +#define OWNER_REFCOUNT (INT_MAX >> 2) + + +struct pcpu_ref { + counter_u64_t pr_pcpu_refs; + volatile int pr_refcnt; + int pr_flags; +} __aligned(CACHE_LINE_SIZE); + +pcpu_ref_t +pcpu_ref_alloc(int flags) +{ + pcpu_ref_t pr; + + pr = malloc(sizeof(*pr), M_PCPU_REF, flags); + if (pr == NULL) + return (NULL); + if ((pr->pr_pcpu_refs = counter_u64_alloc(flags)) == NULL) { + free(pr, M_PCPU_REF); + return (NULL); + } + pr->pr_flags = 0; + pr->pr_refcnt = OWNER_REFCOUNT; +#ifdef INVARIANTS + int cpu; + int64_t sum = 0; + CPU_FOREACH(cpu) + sum += *(int64_t*)zpcpu_get_cpu(pr->pr_pcpu_refs, cpu); + KASSERT(sum == 0, ("sum: %jd != 0", sum)); +#endif + return (pr); +} + +void +pcpu_ref_free(pcpu_ref_t pr) +{ + counter_u64_free(pr->pr_pcpu_refs); + free(pr, M_PCPU_REF); +} + +void +pcpu_ref_incr(pcpu_ref_t pr, int incr) +{ + epoch_enter(global_epoch); +#ifdef INVARIANTS + int64_t sum = 0; + int refcount, cpu; + + refcount = pr->pr_refcnt; + if (__predict_true((pr->pr_flags & PR_DYING) == 0)) { + CPU_FOREACH(cpu) + sum += *(int64_t*)zpcpu_get_cpu(pr->pr_pcpu_refs, cpu); + refcount -= OWNER_REFCOUNT-1; + } + KASSERT(sum + refcount > -2, ("sum: %jd + refcount: %d <= 0", sum, refcount)); + if (sum + refcount <= 0) { + printf("sum: %jd + refcount: %d <= 0", sum, refcount); + kdb_backtrace(); + } +#endif + if (__predict_false(pr->pr_flags & PR_DYING)) + atomic_add_int(&pr->pr_refcnt, incr); + else + *(int64_t*)zpcpu_get(pr->pr_pcpu_refs) += incr; + epoch_exit(global_epoch); +} + +int +pcpu_ref_decr(pcpu_ref_t pr, int decr) +{ + int rc, value; + epoch_enter(global_epoch); +#ifdef INVARIANTS + int64_t sum = 0; + int cpu, refcount; + + refcount = pr->pr_refcnt; + if (__predict_true((pr->pr_flags & PR_DYING) == 0)) { + CPU_FOREACH(cpu) + sum += *(int64_t*)zpcpu_get_cpu(pr->pr_pcpu_refs, cpu); + refcount -= OWNER_REFCOUNT-1; + } + + KASSERT(sum + refcount >= decr, ("sum: %jd + refcount: %d < decr: %d", + sum, refcount, decr)); +#endif + rc = 0; + if (__predict_true((pr->pr_flags & PR_DYING) == 0)) + *(int64_t*)zpcpu_get(pr->pr_pcpu_refs) -= decr; + else { + value = atomic_fetchadd_int(&pr->pr_refcnt, -decr); + MPASS(value >= decr); + if (value == decr) + rc = 1; + } + epoch_exit(global_epoch); + return (rc); +} + +void +pcpu_ref_kill(pcpu_ref_t pr) +{ + int cpu, sum, value; + + MPASS((pr->pr_flags & PR_DYING) == 0); + pr->pr_flags |= PR_DYING; + epoch_wait(global_epoch); + sum = 0; + CPU_FOREACH(cpu) + sum += *(int64_t*)zpcpu_get_cpu(pr->pr_pcpu_refs, cpu); +#ifdef INVARIANTS + KASSERT(sum + pr->pr_refcnt >= OWNER_REFCOUNT, ("sum: %d + pr_refcnt: %d < owner: %d", + sum, pr->pr_refcnt, OWNER_REFCOUNT)); +#endif + + value = atomic_fetchadd_int(&pr->pr_refcnt, sum-OWNER_REFCOUNT+1); + MPASS(value + sum >= OWNER_REFCOUNT); +} Index: sys/mips/include/counter.h =================================================================== --- sys/mips/include/counter.h +++ sys/mips/include/counter.h @@ -47,7 +47,7 @@ counter_u64_read_one(uint64_t *p, int cpu) { - return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); + return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu)); } static inline uint64_t @@ -68,7 +68,7 @@ counter_u64_zero_one_cpu(void *arg) { - *((uint64_t *)((char *)arg + sizeof(struct pcpu) * + *((uint64_t *)((char *)arg + 
UMA_PCPU_ZONE_SIZE * PCPU_GET(cpuid))) = 0; } Index: sys/powerpc/include/counter.h =================================================================== --- sys/powerpc/include/counter.h +++ sys/powerpc/include/counter.h @@ -50,7 +50,7 @@ counter_u64_read_one(uint64_t *p, int cpu) { - return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); + return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu)); } static inline uint64_t @@ -70,7 +70,7 @@ counter_u64_zero_one_cpu(void *arg) { - *((uint64_t *)((char *)arg + sizeof(struct pcpu) * + *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE * PCPU_GET(cpuid))) = 0; } @@ -113,7 +113,7 @@ counter_u64_read_one(uint64_t *p, int cpu) { - return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); + return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu)); } static inline uint64_t @@ -134,7 +134,7 @@ counter_u64_zero_one_cpu(void *arg) { - *((uint64_t *)((char *)arg + sizeof(struct pcpu) * + *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE * PCPU_GET(cpuid))) = 0; } Index: sys/riscv/include/counter.h =================================================================== --- sys/riscv/include/counter.h +++ sys/riscv/include/counter.h @@ -46,7 +46,7 @@ counter_u64_read_one(uint64_t *p, int cpu) { - return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); + return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu)); } static inline uint64_t @@ -67,7 +67,7 @@ counter_u64_zero_one_cpu(void *arg) { - *((uint64_t *)((char *)arg + sizeof(struct pcpu) * + *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE * PCPU_GET(cpuid))) = 0; } Index: sys/sparc64/include/counter.h =================================================================== --- sys/sparc64/include/counter.h +++ sys/sparc64/include/counter.h @@ -47,7 +47,7 @@ counter_u64_read_one(uint64_t *p, int cpu) { - return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); + return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu)); } static inline uint64_t @@ -68,7 +68,7 @@ counter_u64_zero_one_cpu(void *arg) { - *((uint64_t *)((char *)arg + sizeof(struct pcpu) * + *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE * PCPU_GET(cpuid))) = 0; } Index: sys/sys/pcpu.h =================================================================== --- sys/sys/pcpu.h +++ sys/sys/pcpu.h @@ -208,14 +208,16 @@ zpcpu_get(void *base) { - return ((char *)(base) + sizeof(struct pcpu) * curcpu); + /* UMA_PCPU_ZONE_SIZE == PAGE_SIZE */ + return ((char *)(base) + PAGE_SIZE * curcpu); } static inline void * zpcpu_get_cpu(void *base, int cpu) { - return ((char *)(base) + sizeof(struct pcpu) * cpu); + /* UMA_PCPU_ZONE_SIZE == PAGE_SIZE */ + return ((char *)(base) + PAGE_SIZE * cpu); } /* Index: sys/sys/pcpu_quota.h =================================================================== --- /dev/null +++ sys/sys/pcpu_quota.h @@ -0,0 +1,42 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2018, Matthew Macy + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_PCPU_QUOTA_H_ +#define _SYS_PCPU_QUOTA_H_ +struct pcpu_quota; + +struct pcpu_quota *pcpu_quota_alloc(uintptr_t *global, uintptr_t pcpu_slop, + int (*alloc)(void *, uintptr_t, uintptr_t *), void *context, int flags); + +void pcpu_quota_cache_set(struct pcpu_quota *pq, int enable); +void pcpu_quota_free(struct pcpu_quota *pq); +int pcpu_quota_incr(struct pcpu_quota *pq, uintptr_t incr); +void pcpu_quota_decr(struct pcpu_quota *pq, uintptr_t decr); + +#endif Index: sys/sys/pcpu_refcount.h =================================================================== --- /dev/null +++ sys/sys/pcpu_refcount.h @@ -0,0 +1,55 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2018, Matthew Macy + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _SYS_PCPU_REFCOUNT_H_ +#define _SYS_PCPU_REFCOUNT_H_ + + +struct pcpu_ref; +typedef struct pcpu_ref *pcpu_ref_t; + +pcpu_ref_t pcpu_ref_alloc(int flags); +void pcpu_ref_free(pcpu_ref_t pr); +void pcpu_ref_incr(pcpu_ref_t pr, int incr); +int pcpu_ref_decr(pcpu_ref_t pr, int decr); +void pcpu_ref_kill(pcpu_ref_t pr); + +static inline void +pcpu_ref_acquire(pcpu_ref_t pr) +{ + pcpu_ref_incr(pr, 1); +} + +static inline int +pcpu_ref_release(pcpu_ref_t pr) +{ + return (pcpu_ref_decr(pr, 1)); +} + +#endif Index: sys/sys/resourcevar.h =================================================================== --- sys/sys/resourcevar.h +++ sys/sys/resourcevar.h @@ -97,8 +97,8 @@ */ struct uidinfo { LIST_ENTRY(uidinfo) ui_hash; /* (c) hash chain of uidinfos */ - struct mtx ui_vmsize_mtx; - vm_ooffset_t ui_vmsize; /* (d) swap reservation by uid */ + vm_offset_t ui_vmsize; /* (d) swap reservation by uid */ + struct pcpu_quota *ui_vmsize_pq; long ui_sbsize; /* (b) socket buffer space consumed */ long ui_proccnt; /* (b) number of processes */ long ui_ptscnt; /* (b) number of pseudo-terminals */ Index: sys/sys/ucred.h =================================================================== --- sys/sys/ucred.h +++ sys/sys/ucred.h @@ -48,8 +48,9 @@ * priv(9) interface should be used to check for privilege. */ #if defined(_KERNEL) || defined(_WANT_UCRED) +#include struct ucred { - u_int cr_ref; /* reference count */ + pcpu_ref_t cr_pref; /* pcpu reference count */ #define cr_startcopy cr_uid uid_t cr_uid; /* effective user id */ uid_t cr_ruid; /* real user id */ @@ -78,6 +79,8 @@ * Flags for cr_flags. */ #define CRED_FLAG_CAPMODE 0x00000001 /* In capability mode. */ +#define CRED_FLAG_ONSTACK 0x00000002 /* Stack allocated */ +#define CRED_FLAG_OWNED 0x00000004 /* Has an owner */ /* * This is the external representation of struct ucred. @@ -111,6 +114,7 @@ void proc_set_cred_init(struct proc *p, struct ucred *cr); struct ucred *proc_set_cred(struct proc *p, struct ucred *cr); void crfree(struct ucred *cr); +void crdrop_owner(struct ucred *cr); struct ucred *crget(void); struct ucred *crhold(struct ucred *cr); void cru2x(struct ucred *cr, struct xucred *xcr); Index: sys/ufs/ufs/ufs_vnops.c =================================================================== --- sys/ufs/ufs/ufs_vnops.c +++ sys/ufs/ufs/ufs_vnops.c @@ -1846,7 +1846,7 @@ * XXX This seems to never be accessed out of * our context so a stack variable is ok. */ - refcount_init(&ucred.cr_ref, 1); + ucred.cr_flags = CRED_FLAG_ONSTACK; ucred.cr_uid = ip->i_uid; ucred.cr_ngroups = 1; ucred.cr_groups = &ucred_group; @@ -2610,7 +2610,7 @@ * XXX This seems to never be accessed out of our * context so a stack variable is ok. 
*/ - refcount_init(&ucred.cr_ref, 1); + ucred.cr_flags = CRED_FLAG_ONSTACK; ucred.cr_uid = ip->i_uid; ucred.cr_ngroups = 1; ucred.cr_groups = &ucred_group; Index: sys/vm/swap_pager.h =================================================================== --- sys/vm/swap_pager.h +++ sys/vm/swap_pager.h @@ -76,6 +76,8 @@ #ifdef _KERNEL extern int swap_pager_avail; +extern vm_offset_t vmsize_max_pcpu_slop; +extern vm_offset_t vmsize_max_slop; struct xswdev; int swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len); @@ -86,6 +88,7 @@ int swap_pager_nswapdev(void); int swap_pager_reserve(vm_object_t, vm_pindex_t, vm_size_t); void swap_pager_status(int *total, int *used); +int swap_pager_vmsize_alloc(void *arg, vm_offset_t incr, vm_offset_t *slop); void swapoff_all(void); #endif /* _KERNEL */ Index: sys/vm/swap_pager.c =================================================================== --- sys/vm/swap_pager.c +++ sys/vm/swap_pager.c @@ -88,6 +88,7 @@ #include #include #include +#include #include #include #include @@ -98,6 +99,7 @@ #include #include #include +#include #include #include @@ -154,9 +156,22 @@ static vm_ooffset_t swap_total; SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0, "Total amount of available swap storage."); -static vm_ooffset_t swap_reserved; +static vm_offset_t swap_max_pcpu_slop; +static vm_offset_t swap_max_slop; +vm_offset_t vmsize_max_pcpu_slop; +vm_offset_t vmsize_max_slop; +static vm_offset_t swap_reserved; +#ifdef __LP64__ +SYSCTL_QUAD(_vm, OID_AUTO, swap_max_slop, CTLFLAG_RD, &swap_max_slop, 0, + "maximum amount of slop in swap accounting."); SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0, "Amount of swap storage needed to back all allocated anonymous memory."); +#else +SYSCTL_INT(_vm, OID_AUTO, swap_max_slop, CTLFLAG_RD, &swap_max_slop, 0, + "maximum amount of slop in swap accounting."); +SYSCTL_INT(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0, + "Amount of swap storage needed to back all allocated anonymous memory."); +#endif static int overcommit = 0; SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0, "Configure virtual memory overcommit behavior. 
See tuning(7) " @@ -173,18 +188,149 @@ #define SWAP_RESERVE_RLIMIT_ON (1 << 1) #define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2) +#ifdef __LP64__ +#define atomic_fetchadd_uintptr atomic_fetchadd_long +#define atomic_subtract_uintptr atomic_subtract_long +#define atomic_add_uintptr atomic_add_long +#else +#define atomic_fetchadd_uintptr atomic_fetchadd_int +#define atomic_subtract_uintptr atomic_subtract_int +#define atomic_add_uintptr atomic_add_int +#endif + +struct pcpu_quota *swap_reserve_pq; int -swap_reserve(vm_ooffset_t incr) +swap_reserve(vm_offset_t incr) { return (swap_reserve_by_cred(incr, curthread->td_ucred)); } +static int +swap_alloc_can_cache(void) +{ + vm_offset_t s; + + if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0) + return (1); + if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) { + s = vm_cnt.v_page_count - vm_cnt.v_free_reserved - + vm_wire_count(); + s *= PAGE_SIZE; + } else + s = 0; + s += swap_total; + if (__predict_true(2*swap_max_slop < swap_total - swap_reserved)) + return (1); + + return (0); +} + +static int +swap_alloc_slow(void *arg __unused, uintptr_t incr, uintptr_t *slop) +{ + vm_offset_t r, s, new, adj; + int res, can_cache; + + can_cache = swap_alloc_can_cache(); + adj = (swap_max_pcpu_slop>>1); + MPASS(*slop == 0); + if (can_cache) { + incr += adj; + } else if (__predict_false(swap_max_slop > swap_total - swap_reserved)) + pcpu_quota_cache_set(swap_reserve_pq, 0); + + res = 0; + new = atomic_fetchadd_uintptr(&swap_reserved, incr); + r = new + incr; + if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) { + s = vm_cnt.v_page_count - vm_cnt.v_free_reserved - + vm_wire_count(); + s *= PAGE_SIZE; + } else + s = 0; + s += swap_total; + if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s || + priv_check(curthread, PRIV_VM_SWAP_NOQUOTA) == 0) { + res = 1; + *slop = can_cache*adj; + } else + atomic_subtract_uintptr(&swap_reserved, incr); + + return (res); +} + +static void +swap_alloc_init(void *arg __unused) +{ + uint64_t slop_pages_pcpu; + + slop_pages_pcpu = physmem / (8*mp_ncpus); + swap_max_pcpu_slop = slop_pages_pcpu*PAGE_SIZE; + swap_max_slop = swap_max_pcpu_slop*mp_ncpus; + vmsize_max_pcpu_slop = (slop_pages_pcpu >> 1)*PAGE_SIZE; + vmsize_max_slop = vmsize_max_pcpu_slop*mp_ncpus; + swap_reserve_pq = pcpu_quota_alloc(&swap_reserved, swap_max_pcpu_slop, + swap_alloc_slow, NULL, M_WAITOK); +} +SYSINIT(swap_alloc_init, SI_SUB_VM_CONF, SI_ORDER_ANY, swap_alloc_init, NULL); + +static int +swap_alloc(vm_offset_t incr) +{ + return (pcpu_quota_incr(swap_reserve_pq, incr)); +} + +static void +swap_free(vm_offset_t decr) +{ + return (pcpu_quota_decr(swap_reserve_pq, decr)); +} + int -swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) +swap_pager_vmsize_alloc(void *arg, uintptr_t incr, uintptr_t *slop) { - vm_ooffset_t r, s; - int res, error; + struct uidinfo *uip; + int can_cache; + vm_offset_t new, adj; + + uip = arg; + MPASS(*slop == 0); + adj = (vmsize_max_pcpu_slop >> 1); + if ((overcommit & SWAP_RESERVE_RLIMIT_ON) == 0) { + *slop = adj; + incr += adj; + atomic_add_uintptr(&uip->ui_vmsize, incr); + return (1); + } + + if (__predict_false((overcommit & SWAP_RESERVE_RLIMIT_ON) && + uip->ui_vmsize + swap_max_slop > lim_cur(curthread, RLIMIT_SWAP))) + pcpu_quota_cache_set(uip->ui_vmsize_pq, 0); + if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 && + uip->ui_vmsize + incr > lim_cur(curthread, RLIMIT_SWAP) && + priv_check(curthread, PRIV_VM_SWAP_NORLIMIT)) + return (0); + can_cache = 0; + if ((overcommit & SWAP_RESERVE_RLIMIT_ON) == 0 || + uip->ui_vmsize + 
2*swap_max_slop < lim_cur(curthread, RLIMIT_SWAP)) + can_cache = 1; + + incr += can_cache*adj; + new = atomic_fetchadd_uintptr(&uip->ui_vmsize, incr); + if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 && + new + incr > lim_cur(curthread, RLIMIT_SWAP)) { + atomic_subtract_uintptr(&uip->ui_vmsize, incr); + return (0); + } + *slop = can_cache*adj; + return (1); +} + +int +swap_reserve_by_cred(vm_offset_t incr, struct ucred *cred) +{ + int res; static int curfail; static struct timeval lastfail; struct uidinfo *uip; @@ -197,52 +343,26 @@ #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); - error = racct_add(curproc, RACCT_SWAP, incr); + res = racct_add(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); - if (error != 0) + if (res != 0) return (0); } #endif - res = 0; - mtx_lock(&sw_dev_mtx); - r = swap_reserved + incr; - if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) { - s = vm_cnt.v_page_count - vm_cnt.v_free_reserved - - vm_wire_count(); - s *= PAGE_SIZE; - } else - s = 0; - s += swap_total; - if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s || - (error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) { - res = 1; - swap_reserved = r; - } - mtx_unlock(&sw_dev_mtx); - + res = swap_alloc(incr); if (res) { - UIDINFO_VMSIZE_LOCK(uip); - if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 && - uip->ui_vmsize + incr > lim_cur(curthread, RLIMIT_SWAP) && - priv_check(curthread, PRIV_VM_SWAP_NORLIMIT)) - res = 0; - else - uip->ui_vmsize += incr; - UIDINFO_VMSIZE_UNLOCK(uip); - if (!res) { - mtx_lock(&sw_dev_mtx); - swap_reserved -= incr; - mtx_unlock(&sw_dev_mtx); - } + res = pcpu_quota_incr(uip->ui_vmsize_pq, incr); + if (!res) + swap_free(incr); } if (!res && ppsratecheck(&lastfail, &curfail, 1)) { printf("uid %d, pid %d: swap reservation for %jd bytes failed\n", - uip->ui_uid, curproc->p_pid, incr); + uip->ui_uid, curproc->p_pid, (intmax_t)incr); } #ifdef RACCT - if (!res) { + if (racct_enable && !res) { PROC_LOCK(curproc); racct_sub(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); @@ -253,41 +373,36 @@ } void -swap_reserve_force(vm_ooffset_t incr) +swap_reserve_force(vm_offset_t incr) { struct uidinfo *uip; - mtx_lock(&sw_dev_mtx); - swap_reserved += incr; - mtx_unlock(&sw_dev_mtx); + if (swap_alloc(incr) == 0) + atomic_add_uintptr(&swap_reserved, incr); #ifdef RACCT - PROC_LOCK(curproc); - racct_add_force(curproc, RACCT_SWAP, incr); - PROC_UNLOCK(curproc); + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_force(curproc, RACCT_SWAP, incr); + PROC_UNLOCK(curproc); + } #endif uip = curthread->td_ucred->cr_ruidinfo; - PROC_LOCK(curproc); - UIDINFO_VMSIZE_LOCK(uip); - uip->ui_vmsize += incr; - UIDINFO_VMSIZE_UNLOCK(uip); - PROC_UNLOCK(curproc); + atomic_add_uintptr(&uip->ui_vmsize, incr); } void -swap_release(vm_ooffset_t decr) +swap_release(vm_offset_t decr) { struct ucred *cred; - PROC_LOCK(curproc); cred = curthread->td_ucred; swap_release_by_cred(decr, cred); - PROC_UNLOCK(curproc); } void -swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred) +swap_release_by_cred(vm_offset_t decr, struct ucred *cred) { struct uidinfo *uip; @@ -296,19 +411,15 @@ if (decr & PAGE_MASK) panic("swap_release: & PAGE_MASK"); - mtx_lock(&sw_dev_mtx); if (swap_reserved < decr) panic("swap_reserved < decr"); - swap_reserved -= decr; - mtx_unlock(&sw_dev_mtx); + swap_free(decr); - UIDINFO_VMSIZE_LOCK(uip); if (uip->ui_vmsize < decr) printf("negative vmsize for uid = %d\n", uip->ui_uid); - uip->ui_vmsize -= decr; - UIDINFO_VMSIZE_UNLOCK(uip); - - racct_sub_cred(cred, RACCT_SWAP, decr); + 
pcpu_quota_decr(uip->ui_vmsize_pq, decr); + if (racct_enable) + racct_sub_cred(cred, RACCT_SWAP, decr); } #define SWM_POP 0x01 /* pop out */ Index: sys/vm/uma.h =================================================================== --- sys/vm/uma.h +++ sys/vm/uma.h @@ -44,6 +44,8 @@ /* User visible parameters */ #define UMA_SMALLEST_UNIT (PAGE_SIZE / 256) /* Smallest item allocated */ +#define UMA_PCPU_ZONE_SIZE PAGE_SIZE + /* Types and type defs */ struct uma_zone; @@ -279,8 +281,7 @@ * mini-dumps. */ #define UMA_ZONE_PCPU 0x8000 /* - * Allocates mp_maxid + 1 slabs sized to - * sizeof(struct pcpu). + * Allocates mp_maxid + 1 slabs of PAGE_SIZE */ #define UMA_ZONE_NUMA 0x10000 /* * NUMA aware Zone. Implements a best Index: sys/vm/uma_core.c =================================================================== --- sys/vm/uma_core.c +++ sys/vm/uma_core.c @@ -229,8 +229,10 @@ static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); +static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void page_free(void *, vm_size_t, uint8_t); +static void pcpu_page_free(void *, vm_size_t, uint8_t); static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int); static void cache_drain(uma_zone_t); static void bucket_drain(uma_zone_t, uma_bucket_t); @@ -1172,6 +1174,54 @@ return (p); } +static void * +pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, + int wait) +{ + TAILQ_HEAD(, vm_page) alloctail; + vm_offset_t addr, zkva; + struct pcpu *pc; + int cpu; + vm_page_t p, p_next; + + TAILQ_INIT(&alloctail); + MPASS(bytes == (mp_maxid+1)*PAGE_SIZE); + *pflag = UMA_SLAB_KERNEL; + + for (cpu = 0; cpu <= mp_maxid; cpu++) { + if (CPU_ABSENT(cpu)) { + p = vm_page_alloc_domain(NULL, 0, 0, VM_ALLOC_INTERRUPT | + VM_ALLOC_WIRED | VM_ALLOC_NOOBJ | + ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK : + VM_ALLOC_NOWAIT)); + + } else { + pc = pcpu_find(cpu); + p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, VM_ALLOC_INTERRUPT | + VM_ALLOC_WIRED | VM_ALLOC_NOOBJ | + ((wait & M_WAITOK) != 0 ? 
VM_ALLOC_WAITOK : + VM_ALLOC_NOWAIT)); + } + if (__predict_false(p == NULL)) + goto fail; + TAILQ_INSERT_TAIL(&alloctail, p, listq); + } + if ((addr = kva_alloc(bytes)) == 0) + goto fail; + zkva = addr; + TAILQ_FOREACH(p, &alloctail, listq) { + pmap_qenter(zkva, &p, 1); + zkva += PAGE_SIZE; + } + return ((void*)addr); + fail: + TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { + vm_page_unwire(p, PQ_NONE); + vm_page_free(p); + } + return (NULL); +} + /* * Allocates a number of pages from within an object * @@ -1257,6 +1307,37 @@ kmem_free(vmem, (vm_offset_t)mem, size); } +/* + * Frees pcpu zone allocations + * + * Arguments: + * mem A pointer to the memory to be freed + * size The size of the memory being freed + * flags The original p->us_flags field + * + * Returns: + * Nothing + */ +static void +pcpu_page_free(void *mem, vm_size_t size, uint8_t flags) +{ + vm_offset_t sva, curva; + vm_paddr_t paddr; + vm_page_t m; + + MPASS(size == (mp_maxid+1)*PAGE_SIZE); + sva = (vm_offset_t)mem; + for (curva = sva; curva < sva + size; curva += PAGE_SIZE) { + paddr = pmap_kextract(curva); + m = PHYS_TO_VM_PAGE(paddr); + vm_page_unwire(m, PQ_NONE); + vm_page_free(m); + } + pmap_qremove(sva, size >> PAGE_SHIFT); + kva_free(sva, size); +} + + /* * Zero fill initializer * @@ -1290,9 +1371,8 @@ if (keg->uk_flags & UMA_ZONE_PCPU) { u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU; - slabsize = sizeof(struct pcpu); - keg->uk_ppera = howmany(ncpus * sizeof(struct pcpu), - PAGE_SIZE); + slabsize = PAGE_SIZE; + keg->uk_ppera = ncpus; } else { slabsize = UMA_SLAB_SIZE; keg->uk_ppera = 1; @@ -1311,7 +1391,7 @@ keg->uk_rsize = rsize; KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 || - keg->uk_rsize < sizeof(struct pcpu), + keg->uk_rsize < UMA_PCPU_ZONE_SIZE, ("%s: size %u too large", __func__, keg->uk_rsize)); if (keg->uk_flags & UMA_ZONE_OFFPAGE) @@ -1529,6 +1609,8 @@ else if (keg->uk_ppera == 1) keg->uk_allocf = uma_small_alloc; #endif + else if (keg->uk_flags & UMA_ZONE_PCPU) + keg->uk_allocf = pcpu_page_alloc; else keg->uk_allocf = page_alloc; #ifdef UMA_MD_SMALL_ALLOC @@ -1536,6 +1618,9 @@ keg->uk_freef = uma_small_free; else #endif + if (keg->uk_flags & UMA_ZONE_PCPU) + keg->uk_freef = pcpu_page_free; + else keg->uk_freef = page_free; /* Index: sys/vm/uma_int.h =================================================================== --- sys/vm/uma_int.h +++ sys/vm/uma_int.h @@ -222,9 +222,8 @@ * */ struct uma_keg { - struct mtx_padalign uk_lock; /* Lock for the keg */ + struct mtx uk_lock; /* Lock for the keg */ struct uma_hash uk_hash; - LIST_HEAD(,uma_zone) uk_zones; /* Keg's zones */ uint32_t uk_cursor; /* Domain alloc cursor. */ @@ -315,41 +314,49 @@ * */ struct uma_zone { - struct mtx_padalign uz_lock; /* Lock for the zone */ - struct mtx_padalign *uz_lockptr; - const char *uz_name; /* Text name of the zone */ - - LIST_ENTRY(uma_zone) uz_link; /* List of all zones in keg */ + /* Offset 0, used in alloc/free fast/medium fast path and const. */ + struct mtx *uz_lockptr; + const char *uz_name; /* Text name of the zone */ struct uma_zone_domain *uz_domain; /* per-domain buckets */ - - LIST_HEAD(,uma_klink) uz_kegs; /* List of kegs. */ - struct uma_klink uz_klink; /* klink for first keg. */ - - uma_slaballoc uz_slab; /* Allocate a slab from the backend. 
*/ + uint32_t uz_flags; /* Flags inherited from kegs */ + uint32_t uz_size; /* Size inherited from kegs */ uma_ctor uz_ctor; /* Constructor for each allocation */ uma_dtor uz_dtor; /* Destructor */ uma_init uz_init; /* Initializer for each item */ uma_fini uz_fini; /* Finalizer for each item. */ + + /* Offset 64, used in bucket replenish. */ uma_import uz_import; /* Import new memory to cache. */ uma_release uz_release; /* Release memory from cache. */ void *uz_arg; /* Import/release argument. */ - - uint32_t uz_flags; /* Flags inherited from kegs */ - uint32_t uz_size; /* Size inherited from kegs */ - - volatile u_long uz_allocs UMA_ALIGN; /* Total number of allocations */ - volatile u_long uz_fails; /* Total number of alloc failures */ - volatile u_long uz_frees; /* Total number of frees */ - uint64_t uz_sleeps; /* Total number of alloc sleeps */ + uma_slaballoc uz_slab; /* Allocate a slab from the backend. */ uint16_t uz_count; /* Amount of items in full bucket */ uint16_t uz_count_min; /* Minimal amount of items there */ + /* 32bit pad on 64bit. */ + LIST_ENTRY(uma_zone) uz_link; /* List of all zones in keg */ + LIST_HEAD(,uma_klink) uz_kegs; /* List of kegs. */ + /* Offset 128 Rare. */ + /* + * The lock is placed here to avoid adjacent line prefetcher + * in fast paths and to take up space near infrequently accessed + * members to reduce alignment overhead. + */ + struct mtx uz_lock; /* Lock for the zone */ + struct uma_klink uz_klink; /* klink for first keg. */ /* The next two fields are used to print a rate-limited warnings. */ const char *uz_warning; /* Warning to print on failure */ struct timeval uz_ratecheck; /* Warnings rate-limiting */ - struct task uz_maxaction; /* Task to run when at limit */ + /* 16 bytes of pad. */ + + /* Offset 256, atomic stats. */ + volatile u_long uz_allocs UMA_ALIGN; /* Total number of allocations */ + volatile u_long uz_fails; /* Total number of alloc failures */ + volatile u_long uz_frees; /* Total number of frees */ + uint64_t uz_sleeps; /* Total number of alloc sleeps */ + /* * This HAS to be the last item because we adjust the zone size * based on NCPU and then allocate the space for the zones. Index: sys/vm/vm.h =================================================================== --- sys/vm/vm.h +++ sys/vm/vm.h @@ -151,11 +151,11 @@ extern int vm_ndomains; struct ucred; -int swap_reserve(vm_ooffset_t incr); -int swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred); -void swap_reserve_force(vm_ooffset_t incr); -void swap_release(vm_ooffset_t decr); -void swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred); +int swap_reserve(vm_offset_t incr); +int swap_reserve_by_cred(vm_offset_t incr, struct ucred *cred); +void swap_reserve_force(vm_offset_t incr); +void swap_release(vm_offset_t decr); +void swap_release_by_cred(vm_offset_t decr, struct ucred *cred); void swapper(void); #endif /* VM_H */ Index: sys/x86/acpica/srat.c =================================================================== --- sys/x86/acpica/srat.c +++ sys/x86/acpica/srat.c @@ -517,12 +517,15 @@ static void srat_set_cpus(void *dummy) { +#ifdef NUMA struct cpu_info *cpu; struct pcpu *pc; u_int i; +#endif if (srat_physaddr == 0) return; +#ifdef NUMA for (i = 0; i < MAXCPU; i++) { if (CPU_ABSENT(i)) continue; @@ -538,7 +541,7 @@ printf("SRAT: CPU %u has memory domain %d\n", i, cpu->domain); } - +#endif /* Last usage of the cpus array, unmap it. */ pmap_unmapbios((vm_offset_t)cpus, sizeof(*cpus) * (max_apic_id + 1)); cpus = NULL;
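
Note (not part of the patch): the sketch below illustrates how a subsystem would consume the new pcpu_quota(9)-style interface added in sys/sys/pcpu_quota.h and sys/kern/subr_pcpu_quota.c above, in the same way swap_pager.c uses it for swap_reserved. It is a minimal, hypothetical example: the example_* names, the 1 MB slop constant, and the slow-path policy are placeholders invented for illustration; only the function names and signatures are taken from the headers in this diff.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <machine/atomic.h>
#include <sys/pcpu_quota.h>

/* Hypothetical global usage total tracked by a pcpu_quota. */
static uintptr_t example_used;
static struct pcpu_quota *example_pq;

/*
 * Slow path, invoked by pcpu_quota_incr() only when the local CPU's
 * cached slop cannot satisfy the request.  'incr' is the remaining
 * deficit; the callback charges it against the global total and may
 * hand back extra slop via '*slop' for the caller's CPU to cache.
 * Runs inside the global epoch section entered by pcpu_quota_incr().
 */
static int
example_alloc(void *ctx __unused, uintptr_t incr, uintptr_t *slop)
{

	atomic_add_long((volatile u_long *)&example_used, (u_long)incr);
	*slop = 0;	/* No extra per-CPU caching in this sketch. */
	return (1);	/* Nonzero indicates the reservation succeeded. */
}

static void
example_init(void)
{

	/* Allow up to 1 MB of per-CPU slop before hitting the slow path. */
	example_pq = pcpu_quota_alloc(&example_used, 1024 * 1024,
	    example_alloc, NULL, M_WAITOK);
}

static int
example_reserve(uintptr_t bytes)
{

	/* Fast path touches only the local CPU's slop counter. */
	return (pcpu_quota_incr(example_pq, bytes));
}

static void
example_release(uintptr_t bytes)
{

	pcpu_quota_decr(example_pq, bytes);
}

The per-CPU reference counter added in sys/sys/pcpu_refcount.h follows the same pattern: pcpu_ref_acquire()/pcpu_ref_release() replace refcount_acquire()/refcount_release() on the fast path, and pcpu_ref_kill() collapses the per-CPU counts into the shared atomic once the owner drops its reference, as crhold(), crfree(), and crdrop_owner() do in the kern_prot.c hunks above.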