Page MenuHomeFreeBSD

D15975.diff
No OneTemporary

D15975.diff

Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -114,6 +114,7 @@
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/systm.h>
+#include <sys/epoch.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
@@ -349,6 +350,7 @@
vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
pt_entry_t pg_nx;
+static epoch_t pmap_epoch;
static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
@@ -438,14 +440,12 @@
CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
"Count of saved TLB context on switch");
-static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
- LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
-static struct mtx invl_gen_mtx;
-static u_long pmap_invl_gen = 0;
-/* Fake lock object to satisfy turnstiles interface. */
-static struct lock_object invl_gen_ts = {
- .lo_name = "invlts",
-};
+static void
+pmap_epoch_init(void *arg __unused)
+{
+	pmap_epoch = epoch_alloc(EPOCH_PREEMPT); /* preemptible: DI sections may be preempted */
+}
+SYSINIT(epoch, SI_SUB_TASKQ + 1, SI_ORDER_ANY, pmap_epoch_init, NULL); /* NOTE(review): ident "epoch" is generic -- consider "pmap_epoch" */
static bool
pmap_not_in_di(void)
@@ -468,19 +468,8 @@
static void
pmap_delayed_invl_started(void)
{
-	struct pmap_invl_gen *invl_gen;
-	u_long currgen;
-
-	invl_gen = &curthread->td_md.md_invl_gen;
-	PMAP_ASSERT_NOT_IN_DI();
-	mtx_lock(&invl_gen_mtx);
-	if (LIST_EMPTY(&pmap_invl_gen_tracker))
-		currgen = pmap_invl_gen;
-	else
-		currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
-	invl_gen->gen = currgen + 1;
-	LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
-	mtx_unlock(&invl_gen_mtx);
+	epoch_enter_preempt(pmap_epoch); /* a DI block is now an epoch section */
+	curthread->td_md.md_invl_gen.gen = 1; /* nonzero gen only flags "in DI" now -- presumably read by pmap_not_in_di(); confirm */
}
/*
@@ -500,28 +489,8 @@
static void
pmap_delayed_invl_finished(void)
{
-	struct pmap_invl_gen *invl_gen, *next;
-	struct turnstile *ts;
-
-	invl_gen = &curthread->td_md.md_invl_gen;
-	KASSERT(invl_gen->gen != 0, ("missed invl_started"));
-	mtx_lock(&invl_gen_mtx);
-	next = LIST_NEXT(invl_gen, link);
-	if (next == NULL) {
-		turnstile_chain_lock(&invl_gen_ts);
-		ts = turnstile_lookup(&invl_gen_ts);
-		pmap_invl_gen = invl_gen->gen;
-		if (ts != NULL) {
-			turnstile_broadcast(ts, TS_SHARED_QUEUE);
-			turnstile_unpend(ts);
-		}
-		turnstile_chain_unlock(&invl_gen_ts);
-	} else {
-		next->gen = invl_gen->gen;
-	}
-	LIST_REMOVE(invl_gen, link);
-	mtx_unlock(&invl_gen_mtx);
-	invl_gen->gen = 0;
+	curthread->td_md.md_invl_gen.gen = 0; /* NOTE(review): old KASSERT(gen != 0, "missed invl_started") was dropped -- consider keeping it */
+	epoch_exit_preempt(pmap_epoch); /* leave the DI epoch section entered in pmap_delayed_invl_started() */
}
#ifdef PV_STATS
@@ -544,36 +513,14 @@
* pmap_delayed_invl_wait(), upon its return we know that no CPU has a
* valid mapping for the page m in either its page table or TLB.
*
- * This function works by blocking until the global DI generation
- * number catches up with the generation number associated with the
- * given page m and its PV list. Since this function's callers
- * typically own an object lock and sometimes own a page lock, it
- * cannot sleep. Instead, it blocks on a turnstile to relinquish the
- * processor.
+ * This function works by checking that there are either no callers
+ * within a DI block or if there are that a grace period elapses for
+ * any callers in an epoch section when it is initially called.
*/
static void
pmap_delayed_invl_wait(vm_page_t m)
{
-	struct turnstile *ts;
-	u_long *m_gen;
-#ifdef PV_STATS
-	bool accounted = false;
-#endif
-
-	m_gen = pmap_delayed_invl_genp(m);
-	while (*m_gen > pmap_invl_gen) {
-#ifdef PV_STATS
-		if (!accounted) {
-			atomic_add_long(&invl_wait, 1);
-			accounted = true;
-		}
-#endif
-		ts = turnstile_trywait(&invl_gen_ts);
-		if (*m_gen > pmap_invl_gen)
-			turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
-		else
-			turnstile_cancel(ts);
-	}
+	epoch_wait_preempt(pmap_epoch); /* NOTE(review): callers may hold object/page locks and must not sleep (per the old comment above); confirm epoch_wait_preempt() is safe here. PV_STATS invl_wait accounting was also dropped. */
}
/*
@@ -1130,11 +1077,6 @@
TAILQ_INIT(&kernel_pmap->pm_pvchunk);
kernel_pmap->pm_flags = pmap_flags;
- /*
- * Initialize the TLB invalidations generation number lock.
- */
- mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);
-
/*
* Reserve some special page table entries/VA space for temporary
* mapping of pages.
Index: sys/amd64/include/counter.h
===================================================================
--- sys/amd64/include/counter.h
+++ sys/amd64/include/counter.h
@@ -45,7 +45,7 @@
counter_u64_read_one(uint64_t *p, int cpu)
{
- return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+ return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu));
}
static inline uint64_t
@@ -65,7 +65,7 @@
counter_u64_zero_one_cpu(void *arg)
{
- *((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+ *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE *
PCPU_GET(cpuid))) = 0;
}
Index: sys/arm/include/counter.h
===================================================================
--- sys/arm/include/counter.h
+++ sys/arm/include/counter.h
@@ -47,7 +47,7 @@
counter_u64_read_one(uint64_t *p, int cpu)
{
- return (atomic_load_64((uint64_t *)((char *)p + sizeof(struct pcpu) *
+ return (atomic_load_64((uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE *
cpu)));
}
@@ -68,7 +68,7 @@
counter_u64_zero_one_cpu(void *arg)
{
- atomic_store_64((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+ atomic_store_64((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE *
PCPU_GET(cpuid)), 0);
}
Index: sys/arm64/include/counter.h
===================================================================
--- sys/arm64/include/counter.h
+++ sys/arm64/include/counter.h
@@ -44,7 +44,7 @@
counter_u64_read_one(uint64_t *p, int cpu)
{
- return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+ return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu));
}
static inline uint64_t
@@ -64,7 +64,7 @@
counter_u64_zero_one_cpu(void *arg)
{
- *((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+ *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE *
PCPU_GET(cpuid))) = 0;
}
Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -3894,6 +3894,8 @@
kern/subr_msgbuf.c standard
kern/subr_param.c standard
kern/subr_pcpu.c standard
+kern/subr_pcpu_quota.c standard
+kern/subr_pcpu_refcount.c standard
kern/subr_pctrie.c standard
kern/subr_pidctrl.c standard
kern/subr_power.c standard
Index: sys/i386/include/counter.h
===================================================================
--- sys/i386/include/counter.h
+++ sys/i386/include/counter.h
@@ -104,13 +104,13 @@
critical_enter();
CPU_FOREACH(i) {
res += *(uint64_t *)((char *)p +
- sizeof(struct pcpu) * i);
+ UMA_PCPU_ZONE_SIZE * i);
}
critical_exit();
} else {
CPU_FOREACH(i)
res += counter_u64_read_one_8b((uint64_t *)((char *)p +
- sizeof(struct pcpu) * i));
+ UMA_PCPU_ZONE_SIZE * i));
}
return (res);
}
@@ -137,7 +137,7 @@
{
uint64_t *p;
- p = (uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid));
+ p = (uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE * PCPU_GET(cpuid));
counter_u64_zero_one_8b(p);
}
@@ -149,7 +149,7 @@
if ((cpu_feature & CPUID_CX8) == 0) {
critical_enter();
CPU_FOREACH(i)
- *(uint64_t *)((char *)c + sizeof(struct pcpu) * i) = 0;
+ *(uint64_t *)((char *)c + UMA_PCPU_ZONE_SIZE * i) = 0;
critical_exit();
} else {
smp_rendezvous(smp_no_rendezvous_barrier,
Index: sys/kern/kern_prot.c
===================================================================
--- sys/kern/kern_prot.c
+++ sys/kern/kern_prot.c
@@ -1829,7 +1829,8 @@
struct ucred *cr;
cr = malloc(sizeof(*cr), M_CRED, M_WAITOK | M_ZERO);
- refcount_init(&cr->cr_ref, 1);
+ cr->cr_pref = pcpu_ref_alloc(M_WAITOK);
+
#ifdef AUDIT
audit_cred_init(cr);
#endif
@@ -1848,11 +1849,19 @@
struct ucred *
crhold(struct ucred *cr)
{
+	if (cr->cr_flags & CRED_FLAG_ONSTACK) /* stack-allocated creds are not refcounted */
+		return (cr);
-	refcount_acquire(&cr->cr_ref);
+	pcpu_ref_acquire(cr->cr_pref); /* per-CPU (epoch-protected) reference acquire */
	return (cr);
}
+void
+crdrop_owner(struct ucred *cr)
+{
+	pcpu_ref_kill(cr->cr_pref); /* drop the owner bias; subsequent acquire/release go through the atomic slow path */
+}
+
/*
* Free a cred structure. Throws away space when ref count gets to 0.
*/
@@ -1860,9 +1869,12 @@
crfree(struct ucred *cr)
{
- KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref));
- KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred"));
- if (refcount_release(&cr->cr_ref)) {
+ KASSERT((unsigned int)((uintptr_t)cr->cr_pref) != 0xdeadc0de, ("cr: %p dangling reference to ucred", cr));
+ if (cr->cr_flags & CRED_FLAG_ONSTACK)
+ return;
+
+ if (pcpu_ref_release(cr->cr_pref)) {
+ pcpu_ref_free(cr->cr_pref);
/*
* Some callers of crget(), such as nfs_statfs(),
* allocate a temporary credential, but don't
@@ -1898,7 +1910,7 @@
crcopy(struct ucred *dest, struct ucred *src)
{
- KASSERT(dest->cr_ref == 1, ("crcopy of shared ucred"));
+ //KASSERT(dest->cr_ref == 1, ("crcopy of shared ucred"));
bcopy(&src->cr_startcopy, &dest->cr_startcopy,
(unsigned)((caddr_t)&src->cr_endcopy -
(caddr_t)&src->cr_startcopy));
@@ -1913,6 +1925,7 @@
#ifdef MAC
mac_cred_copy(src, dest);
#endif
+ dest->cr_flags &= ~(CRED_FLAG_ONSTACK|CRED_FLAG_OWNED);
}
/*
@@ -1953,7 +1966,16 @@
void
proc_set_cred_init(struct proc *p, struct ucred *newcred)
{
-
+ struct ucred *dupcred;
+#ifdef notyet
+ if (newcred->cr_flags & CRED_FLAG_OWNED)
+#endif
+ {
+ dupcred = crdup(newcred);
+ crfree(newcred);
+ newcred = dupcred;
+ }
+ newcred->cr_flags |= CRED_FLAG_OWNED;
p->p_ucred = newcred;
}
@@ -1975,10 +1997,23 @@
MPASS(p->p_ucred != NULL);
if (newcred == NULL)
MPASS(p->p_state == PRS_ZOMBIE);
- else
+ else {
+#ifdef notyet
+ if (newcred->cr_flags & CRED_FLAG_OWNED)
+ {
+ oldcred = crdup(newcred);
+ crfree(newcred);
+ newcred = oldcred;
+ newcred->cr_flags |= CRED_FLAG_OWNED;
+ }
+#endif
+ MPASS((newcred->cr_flags & CRED_FLAG_OWNED) == 0);
+ newcred->cr_flags |= CRED_FLAG_OWNED;
PROC_LOCK_ASSERT(p, MA_OWNED);
-
+ }
oldcred = p->p_ucred;
+ crdrop_owner(oldcred);
+
p->p_ucred = newcred;
if (newcred != NULL)
PROC_UPDATE_COW(p);
@@ -2002,7 +2037,6 @@
oldcred = p->p_ucred;
}
crcopy(cr, oldcred);
-
return (oldcred);
}
Index: sys/kern/kern_resource.c
===================================================================
--- sys/kern/kern_resource.c
+++ sys/kern/kern_resource.c
@@ -47,6 +47,7 @@
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/pcpu_quota.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
@@ -65,6 +66,7 @@
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
+#include <vm/swap_pager.h>
static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures");
@@ -1244,6 +1246,7 @@
return (uip);
}
+
/*
* Find or allocate a struct uidinfo for a particular uid.
* Returns with uidinfo struct referenced.
@@ -1276,7 +1279,8 @@
racct_create(&new_uip->ui_racct);
refcount_init(&new_uip->ui_ref, 1);
new_uip->ui_uid = uid;
- mtx_init(&new_uip->ui_vmsize_mtx, "ui_vmsize", NULL, MTX_DEF);
+ new_uip->ui_vmsize_pq = pcpu_quota_alloc(&new_uip->ui_vmsize,
+ vmsize_max_pcpu_slop, swap_pager_vmsize_alloc, new_uip, M_WAITOK);
rw_wlock(&uihashtbl_lock);
/*
@@ -1291,7 +1295,6 @@
} else {
rw_wunlock(&uihashtbl_lock);
racct_destroy(&new_uip->ui_racct);
- mtx_destroy(&new_uip->ui_vmsize_mtx);
free(new_uip, M_UIDINFO);
}
return (uip);
@@ -1343,6 +1346,7 @@
LIST_REMOVE(uip, ui_hash);
rw_wunlock(&uihashtbl_lock);
+ pcpu_quota_cache_set(uip->ui_vmsize_pq, 0);
if (uip->ui_sbsize != 0)
printf("freeing uidinfo: uid = %d, sbsize = %ld\n",
uip->ui_uid, uip->ui_sbsize);
@@ -1352,7 +1356,6 @@
if (uip->ui_vmsize != 0)
printf("freeing uidinfo: uid = %d, swapuse = %lld\n",
uip->ui_uid, (unsigned long long)uip->ui_vmsize);
- mtx_destroy(&uip->ui_vmsize_mtx);
free(uip, M_UIDINFO);
}
Index: sys/kern/subr_counter.c
===================================================================
--- sys/kern/subr_counter.c
+++ sys/kern/subr_counter.c
@@ -50,6 +50,15 @@
counter_u64_zero_inline(c);
}
+static void
+counter_u64_zero_sync(counter_u64_t c)
+{
+	int cpu;
+
+	CPU_FOREACH(cpu) /* direct (non-rendezvous) zeroing; only safe before the counter is published to other CPUs */
+		*(uint64_t*)zpcpu_get_cpu(c, cpu) = 0;
+}
+
uint64_t
counter_u64_fetch(counter_u64_t c)
{
@@ -64,7 +73,7 @@
r = uma_zalloc_pcpu(pcpu_zone_64, flags);
if (r != NULL)
- counter_u64_zero(r);
+ counter_u64_zero_sync(r);
return (r);
}
Index: sys/kern/subr_pcpu.c
===================================================================
--- sys/kern/subr_pcpu.c
+++ sys/kern/subr_pcpu.c
@@ -75,8 +75,8 @@
static DPCPU_DEFINE(char, modspace[DPCPU_MODMIN]);
static TAILQ_HEAD(, dpcpu_free) dpcpu_head = TAILQ_HEAD_INITIALIZER(dpcpu_head);
static struct sx dpcpu_lock;
-uintptr_t dpcpu_off[MAXCPU];
-struct pcpu *cpuid_to_pcpu[MAXCPU];
+__read_mostly uintptr_t dpcpu_off[MAXCPU];
+__read_mostly struct pcpu *cpuid_to_pcpu[MAXCPU];
struct cpuhead cpuhead = STAILQ_HEAD_INITIALIZER(cpuhead);
/*
Index: sys/kern/subr_pcpu_quota.c
===================================================================
--- /dev/null
+++ sys/kern/subr_pcpu_quota.c
@@ -0,0 +1,184 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/epoch.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/pcpu_quota.h>
+#include <sys/smp.h>
+#include <vm/uma.h>
+
+#include <ck_pr.h>
+
+static MALLOC_DEFINE(M_PCPU_QUOTA, "Per-cpu", "Per-cpu resource accounting.");
+
+#define PCPU_QUOTA_SLOP_GET(p) zpcpu_get((p)->pq_slop)
+#define PCPU_QUOTA_CAN_CACHE 0x1
+#define PCPU_QUOTA_FLUSHING 0x2
+
+struct pcpu_quota {
+ void *pq_context;
+ counter_u64_t pq_slop;
+ uintptr_t *pq_global;
+ uintptr_t pq_pcpu_slop;
+ int (*pq_alloc)(void *context, uintptr_t incr, uintptr_t *slop);
+ volatile int pq_flags;
+} __aligned(CACHE_LINE_SIZE);
+
+
+#ifdef __LP64__
+#define atomic_subtract_uintptr atomic_subtract_long
+#else
+#define atomic_subtract_uintptr atomic_subtract_int
+#endif
+
+static void
+pcpu_quota_flush(struct pcpu_quota *pq)
+{
+	int64_t *p;
+	uintptr_t value; /* NOTE(review): on ILP32 this truncates the 64-bit slop sum -- confirm */
+	int cpu;
+
+	value = 0;
+	epoch_enter(global_epoch); /* keep per-CPU slop stable against concurrent epoch-section users */
+	CPU_FOREACH(cpu) {
+		p = zpcpu_get_cpu(pq->pq_slop, cpu);
+		MPASS(*p >= 0);
+		value += *p;
+		*p = 0; /* reclaim each CPU's cached slop */
+	}
+	if (value)
+		atomic_subtract_uintptr(pq->pq_global, value); /* return accumulated slop to the global total */
+	epoch_exit(global_epoch);
+}
+
+void
+pcpu_quota_cache_set(struct pcpu_quota *pq, int enable)
+{
+	int *flagsp;
+
+	flagsp = (int *)(uintptr_t)&pq->pq_flags; /* cast away volatile for ck_pr */
+	if (!enable && (pq->pq_flags & PCPU_QUOTA_CAN_CACHE)) {
+		if (ck_pr_btr_int(flagsp, 0) != 0 && /* ck_pr takes a bit INDEX, not a mask: bit 0 == CAN_CACHE (0x1); btr returns old bit, so nonzero == we cleared it */
+		    ck_pr_bts_int(flagsp, 1) == 0) { /* bit 1 == FLUSHING (0x2); zero == we claimed the flush */
+			epoch_wait(global_epoch); /* wait out in-flight slop updates */
+			pcpu_quota_flush(pq);
+			ck_pr_btr_int(flagsp, 1); /* clear FLUSHING */
+		}
+	} else if (enable && (pq->pq_flags & PCPU_QUOTA_CAN_CACHE) == 0) {
+		while (pq->pq_flags & PCPU_QUOTA_FLUSHING)
+			cpu_spinwait(); /* do not re-enable caching under an active flusher */
+		ck_pr_bts_int(flagsp, 0); /* set CAN_CACHE */
+	}
+}
+
+struct pcpu_quota *
+pcpu_quota_alloc(uintptr_t *global, uintptr_t pcpu_slop,
+    int (*alloc)(void *, uintptr_t, uintptr_t*), void *context, int flags)
+{
+	struct pcpu_quota *pq;
+
+	flags &= ~M_ZERO; /* every field is initialized explicitly below */
+	if ((pq = malloc(sizeof(*pq), M_PCPU_QUOTA, flags)) == NULL)
+		return (NULL);
+	if ((pq->pq_slop = counter_u64_alloc(flags)) == NULL) {
+		free(pq, M_PCPU_QUOTA); /* unwind on partial allocation */
+		return (NULL);
+	}
+	pq->pq_pcpu_slop = pcpu_slop; /* per-CPU cache ceiling */
+	pq->pq_context = context; /* opaque argument passed back to pq_alloc */
+	pq->pq_global = global; /* shared total this quota accounts against */
+	pq->pq_alloc = alloc; /* backend called when local slop is exhausted */
+	pq->pq_flags = PCPU_QUOTA_CAN_CACHE; /* caching enabled by default */
+	return (pq);
+}
+
+void
+pcpu_quota_free(struct pcpu_quota *pq)
+{
+	counter_u64_free(pq->pq_slop); /* caller must guarantee no concurrent users remain */
+	free(pq, M_PCPU_QUOTA);
+}
+
+int
+pcpu_quota_incr(struct pcpu_quota *pq, uintptr_t incr)
+{
+	int64_t *p;
+	int rc;
+
+	epoch_enter(global_epoch);
+	p = PCPU_QUOTA_SLOP_GET(pq); /* this CPU's cached slop */
+	if (*p >= incr) { /* NOTE(review): signed int64_t vs unsigned uintptr_t comparison -- confirm intent */
+		*p -= incr; /* fast path: satisfied entirely from local slop */
+		epoch_exit(global_epoch);
+		return (1);
+	}
+	incr -= *p; /* consume what local slop we have, ask backend for the rest */
+	*p = 0;
+	rc = pq->pq_alloc(pq->pq_context, incr, (uintptr_t *)p); /* NOTE(review): uintptr_t* aliasing an int64_t mismatches width on ILP32 -- confirm */
+	if ( __predict_false((pq->pq_flags & PCPU_QUOTA_CAN_CACHE) == 0) && *p > 0)
+		pcpu_quota_cache_set(pq, 1); /* backend granted slop: re-enable caching */
+
+	epoch_exit(global_epoch);
+	return (rc);
+}
+
+void
+pcpu_quota_decr(struct pcpu_quota *pq, uintptr_t decr)
+{
+	int64_t *p;
+	int64_t value;
+	long adj;
+
+	epoch_enter(global_epoch);
+	p = PCPU_QUOTA_SLOP_GET(pq); /* this CPU's cached slop */
+	if (__predict_true(pq->pq_flags & PCPU_QUOTA_CAN_CACHE)) {
+		if (*p + decr <= pq->pq_pcpu_slop) {
+			*p += decr; /* fast path: bank the release locally */
+			epoch_exit(global_epoch);
+			return;
+		}
+		adj = (pq->pq_pcpu_slop >> 1); /* keep half the ceiling cached */
+		value = decr + (*p - adj); /* excess over the retained slop goes global */
+	} else {
+		adj = 0; /* caching disabled: return everything */
+		value = *p + decr;
+	}
+	MPASS(value > 0);
+	*p = adj;
+	atomic_subtract_uintptr(pq->pq_global, (uintptr_t)value); /* credit the global total */
+	epoch_exit(global_epoch);
+}
+
Index: sys/kern/subr_pcpu_refcount.c
===================================================================
--- /dev/null
+++ sys/kern/subr_pcpu_refcount.c
@@ -0,0 +1,164 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/epoch.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/pcpu_refcount.h>
+#include <sys/smp.h>
+
+static MALLOC_DEFINE(M_PCPU_REF, "Pcpuref", "Per-cpu reference counting.");
+#define PR_DYING 0x1
+
+#define OWNER_REFCOUNT (INT_MAX >> 2)
+
+
+struct pcpu_ref {
+ counter_u64_t pr_pcpu_refs;
+ volatile int pr_refcnt;
+ int pr_flags;
+} __aligned(CACHE_LINE_SIZE);
+
+pcpu_ref_t
+pcpu_ref_alloc(int flags)
+{
+	pcpu_ref_t pr;
+
+	pr = malloc(sizeof(*pr), M_PCPU_REF, flags);
+	if (pr == NULL)
+		return (NULL);
+	if ((pr->pr_pcpu_refs = counter_u64_alloc(flags)) == NULL) {
+		free(pr, M_PCPU_REF); /* unwind on partial allocation */
+		return (NULL);
+	}
+	pr->pr_flags = 0;
+	pr->pr_refcnt = OWNER_REFCOUNT; /* owner bias: logical count is (pcpu sum + pr_refcnt - OWNER_REFCOUNT + 1) until killed */
+#ifdef INVARIANTS
+	int cpu;
+	int64_t sum = 0;
+	CPU_FOREACH(cpu) /* counter_u64_alloc() zeroes the per-CPU slots; verify */
+		sum += *(int64_t*)zpcpu_get_cpu(pr->pr_pcpu_refs, cpu);
+	KASSERT(sum == 0, ("sum: %jd != 0", sum));
+#endif
+	return (pr);
+}
+
+void
+pcpu_ref_free(pcpu_ref_t pr)
+{
+	counter_u64_free(pr->pr_pcpu_refs); /* caller must have seen the final release */
+	free(pr, M_PCPU_REF);
+}
+
+void
+pcpu_ref_incr(pcpu_ref_t pr, int incr)
+{
+	epoch_enter(global_epoch); /* epoch section orders us against pcpu_ref_kill()'s epoch_wait() */
+#ifdef INVARIANTS
+	int64_t sum = 0;
+	int refcount, cpu;
+
+	refcount = pr->pr_refcnt;
+	if (__predict_true((pr->pr_flags & PR_DYING) == 0)) {
+		CPU_FOREACH(cpu)
+			sum += *(int64_t*)zpcpu_get_cpu(pr->pr_pcpu_refs, cpu);
+		refcount -= OWNER_REFCOUNT-1; /* remove the owner bias from the live-mode count */
+	}
+	KASSERT(sum + refcount > -2, ("sum: %jd + refcount: %d <= 0", sum, refcount)); /* NOTE(review): "> -2" tolerates -1; message says "<= 0" -- confirm intended bound */
+	if (sum + refcount <= 0) {
+		printf("sum: %jd + refcount: %d <= 0", sum, refcount);
+		kdb_backtrace();
+	}
+#endif
+	if (__predict_false(pr->pr_flags & PR_DYING))
+		atomic_add_int(&pr->pr_refcnt, incr); /* dying: single atomic counter */
+	else
+		*(int64_t*)zpcpu_get(pr->pr_pcpu_refs) += incr; /* live: lock-free per-CPU update */
+	epoch_exit(global_epoch);
+}
+
+int
+pcpu_ref_decr(pcpu_ref_t pr, int decr)
+{
+	int rc, value;
+	epoch_enter(global_epoch); /* epoch section orders us against pcpu_ref_kill()'s epoch_wait() */
+#ifdef INVARIANTS
+	int64_t sum = 0;
+	int cpu, refcount;
+
+	refcount = pr->pr_refcnt;
+	if (__predict_true((pr->pr_flags & PR_DYING) == 0)) {
+		CPU_FOREACH(cpu)
+			sum += *(int64_t*)zpcpu_get_cpu(pr->pr_pcpu_refs, cpu);
+		refcount -= OWNER_REFCOUNT-1; /* remove the owner bias from the live-mode count */
+	}
+
+	KASSERT(sum + refcount >= decr, ("sum: %jd + refcount: %d < decr: %d",
+	    sum, refcount, decr));
+#endif
+	rc = 0;
+	if (__predict_true((pr->pr_flags & PR_DYING) == 0))
+		*(int64_t*)zpcpu_get(pr->pr_pcpu_refs) -= decr; /* live: final release cannot happen before kill */
+	else {
+		value = atomic_fetchadd_int(&pr->pr_refcnt, -decr);
+		MPASS(value >= decr);
+		if (value == decr)
+			rc = 1; /* dropped the last reference */
+	}
+	epoch_exit(global_epoch);
+	return (rc);
+}
+
+void
+pcpu_ref_kill(pcpu_ref_t pr)
+{
+	int cpu, value;
+	int64_t sum; /* 64-bit: the per-CPU slots are int64_t and their sum can overflow int */
+	MPASS((pr->pr_flags & PR_DYING) == 0);
+	pr->pr_flags |= PR_DYING; /* switch new acquires/releases to the atomic path */
+	epoch_wait(global_epoch); /* every epoch section now observes PR_DYING */
+	sum = 0;
+	CPU_FOREACH(cpu) /* per-CPU slots are now stable; fold them into pr_refcnt */
+		sum += *(int64_t*)zpcpu_get_cpu(pr->pr_pcpu_refs, cpu);
+#ifdef INVARIANTS
+	KASSERT(sum + pr->pr_refcnt >= OWNER_REFCOUNT, ("sum: %jd + pr_refcnt: %d < owner: %d",
+	    (intmax_t)sum, pr->pr_refcnt, OWNER_REFCOUNT));
+#endif
+
+	value = atomic_fetchadd_int(&pr->pr_refcnt, (int)(sum - OWNER_REFCOUNT + 1)); /* drop owner bias, keep the owner's own reference */
+	MPASS(value + sum >= OWNER_REFCOUNT);
+}
Index: sys/mips/include/counter.h
===================================================================
--- sys/mips/include/counter.h
+++ sys/mips/include/counter.h
@@ -47,7 +47,7 @@
counter_u64_read_one(uint64_t *p, int cpu)
{
- return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+ return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu));
}
static inline uint64_t
@@ -68,7 +68,7 @@
counter_u64_zero_one_cpu(void *arg)
{
- *((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+ *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE *
PCPU_GET(cpuid))) = 0;
}
Index: sys/powerpc/include/counter.h
===================================================================
--- sys/powerpc/include/counter.h
+++ sys/powerpc/include/counter.h
@@ -50,7 +50,7 @@
counter_u64_read_one(uint64_t *p, int cpu)
{
- return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+ return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu));
}
static inline uint64_t
@@ -70,7 +70,7 @@
counter_u64_zero_one_cpu(void *arg)
{
- *((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+ *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE *
PCPU_GET(cpuid))) = 0;
}
@@ -113,7 +113,7 @@
counter_u64_read_one(uint64_t *p, int cpu)
{
- return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+ return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu));
}
static inline uint64_t
@@ -134,7 +134,7 @@
counter_u64_zero_one_cpu(void *arg)
{
- *((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+ *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE *
PCPU_GET(cpuid))) = 0;
}
Index: sys/riscv/include/counter.h
===================================================================
--- sys/riscv/include/counter.h
+++ sys/riscv/include/counter.h
@@ -46,7 +46,7 @@
counter_u64_read_one(uint64_t *p, int cpu)
{
- return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+ return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu));
}
static inline uint64_t
@@ -67,7 +67,7 @@
counter_u64_zero_one_cpu(void *arg)
{
- *((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+ *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE *
PCPU_GET(cpuid))) = 0;
}
Index: sys/sparc64/include/counter.h
===================================================================
--- sys/sparc64/include/counter.h
+++ sys/sparc64/include/counter.h
@@ -47,7 +47,7 @@
counter_u64_read_one(uint64_t *p, int cpu)
{
- return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+ return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu));
}
static inline uint64_t
@@ -68,7 +68,7 @@
counter_u64_zero_one_cpu(void *arg)
{
- *((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+ *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE *
PCPU_GET(cpuid))) = 0;
}
Index: sys/sys/pcpu.h
===================================================================
--- sys/sys/pcpu.h
+++ sys/sys/pcpu.h
@@ -208,14 +208,16 @@
zpcpu_get(void *base)
{
- return ((char *)(base) + sizeof(struct pcpu) * curcpu);
+ /* UMA_PCPU_ZONE_SIZE == PAGE_SIZE */
+ return ((char *)(base) + PAGE_SIZE * curcpu);
}
static inline void *
zpcpu_get_cpu(void *base, int cpu)
{
- return ((char *)(base) + sizeof(struct pcpu) * cpu);
+ /* UMA_PCPU_ZONE_SIZE == PAGE_SIZE */
+ return ((char *)(base) + PAGE_SIZE * cpu);
}
/*
Index: sys/sys/pcpu_quota.h
===================================================================
--- /dev/null
+++ sys/sys/pcpu_quota.h
@@ -0,0 +1,42 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_PCPU_QUOTA_H_
+#define _SYS_PCPU_QUOTA_H_
+struct pcpu_quota;
+
+struct pcpu_quota *pcpu_quota_alloc(uintptr_t *global, uintptr_t pcpu_slop,
+ int (*alloc)(void *, uintptr_t, uintptr_t *), void *context, int flags);
+
+void pcpu_quota_cache_set(struct pcpu_quota *pq, int enable);
+void pcpu_quota_free(struct pcpu_quota *pq);
+int pcpu_quota_incr(struct pcpu_quota *pq, uintptr_t incr);
+void pcpu_quota_decr(struct pcpu_quota *pq, uintptr_t decr);
+
+#endif
Index: sys/sys/pcpu_refcount.h
===================================================================
--- /dev/null
+++ sys/sys/pcpu_refcount.h
@@ -0,0 +1,55 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_PCPU_REFCOUNT_H_
+#define _SYS_PCPU_REFCOUNT_H_
+
+
+struct pcpu_ref;
+typedef struct pcpu_ref *pcpu_ref_t;
+
+pcpu_ref_t pcpu_ref_alloc(int flags);
+void pcpu_ref_free(pcpu_ref_t pr);
+void pcpu_ref_incr(pcpu_ref_t pr, int incr);
+int pcpu_ref_decr(pcpu_ref_t pr, int decr);
+void pcpu_ref_kill(pcpu_ref_t pr);
+
+static inline void
+pcpu_ref_acquire(pcpu_ref_t pr)
+{
+ pcpu_ref_incr(pr, 1);
+}
+
+static inline int
+pcpu_ref_release(pcpu_ref_t pr)
+{
+ return (pcpu_ref_decr(pr, 1));
+}
+
+#endif
Index: sys/sys/resourcevar.h
===================================================================
--- sys/sys/resourcevar.h
+++ sys/sys/resourcevar.h
@@ -97,8 +97,8 @@
*/
struct uidinfo {
LIST_ENTRY(uidinfo) ui_hash; /* (c) hash chain of uidinfos */
- struct mtx ui_vmsize_mtx;
- vm_ooffset_t ui_vmsize; /* (d) swap reservation by uid */
+ vm_offset_t ui_vmsize; /* (d) swap reservation by uid */
+ struct pcpu_quota *ui_vmsize_pq;
long ui_sbsize; /* (b) socket buffer space consumed */
long ui_proccnt; /* (b) number of processes */
long ui_ptscnt; /* (b) number of pseudo-terminals */
Index: sys/sys/ucred.h
===================================================================
--- sys/sys/ucred.h
+++ sys/sys/ucred.h
@@ -48,8 +48,9 @@
* priv(9) interface should be used to check for privilege.
*/
#if defined(_KERNEL) || defined(_WANT_UCRED)
+#include <sys/pcpu_refcount.h>
struct ucred {
- u_int cr_ref; /* reference count */
+ pcpu_ref_t cr_pref; /* pcpu reference count */
#define cr_startcopy cr_uid
uid_t cr_uid; /* effective user id */
uid_t cr_ruid; /* real user id */
@@ -78,6 +79,8 @@
* Flags for cr_flags.
*/
#define CRED_FLAG_CAPMODE 0x00000001 /* In capability mode. */
+#define CRED_FLAG_ONSTACK 0x00000002 /* Stack allocated */
+#define CRED_FLAG_OWNED 0x00000004 /* Has an owner */
/*
* This is the external representation of struct ucred.
@@ -111,6 +114,7 @@
void proc_set_cred_init(struct proc *p, struct ucred *cr);
struct ucred *proc_set_cred(struct proc *p, struct ucred *cr);
void crfree(struct ucred *cr);
+void crdrop_owner(struct ucred *cr);
struct ucred *crget(void);
struct ucred *crhold(struct ucred *cr);
void cru2x(struct ucred *cr, struct xucred *xcr);
Index: sys/ufs/ufs/ufs_vnops.c
===================================================================
--- sys/ufs/ufs/ufs_vnops.c
+++ sys/ufs/ufs/ufs_vnops.c
@@ -1846,7 +1846,7 @@
* XXX This seems to never be accessed out of
* our context so a stack variable is ok.
*/
- refcount_init(&ucred.cr_ref, 1);
+ ucred.cr_flags = CRED_FLAG_ONSTACK;
ucred.cr_uid = ip->i_uid;
ucred.cr_ngroups = 1;
ucred.cr_groups = &ucred_group;
@@ -2610,7 +2610,7 @@
* XXX This seems to never be accessed out of our
* context so a stack variable is ok.
*/
- refcount_init(&ucred.cr_ref, 1);
+ ucred.cr_flags = CRED_FLAG_ONSTACK;
ucred.cr_uid = ip->i_uid;
ucred.cr_ngroups = 1;
ucred.cr_groups = &ucred_group;
Index: sys/vm/swap_pager.h
===================================================================
--- sys/vm/swap_pager.h
+++ sys/vm/swap_pager.h
@@ -76,6 +76,8 @@
#ifdef _KERNEL
extern int swap_pager_avail;
+extern vm_offset_t vmsize_max_pcpu_slop;
+extern vm_offset_t vmsize_max_slop;
struct xswdev;
int swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len);
@@ -86,6 +88,7 @@
int swap_pager_nswapdev(void);
int swap_pager_reserve(vm_object_t, vm_pindex_t, vm_size_t);
void swap_pager_status(int *total, int *used);
+int swap_pager_vmsize_alloc(void *arg, vm_offset_t incr, vm_offset_t *slop);
void swapoff_all(void);
#endif /* _KERNEL */
Index: sys/vm/swap_pager.c
===================================================================
--- sys/vm/swap_pager.c
+++ sys/vm/swap_pager.c
@@ -88,6 +88,7 @@
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
+#include <sys/pcpu_quota.h>
#include <sys/pctrie.h>
#include <sys/racct.h>
#include <sys/resource.h>
@@ -98,6 +99,7 @@
#include <sys/sysproto.h>
#include <sys/blist.h>
#include <sys/lock.h>
+#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
@@ -154,9 +156,22 @@
static vm_ooffset_t swap_total;
SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0,
"Total amount of available swap storage.");
-static vm_ooffset_t swap_reserved;
+static vm_offset_t swap_max_pcpu_slop;
+static vm_offset_t swap_max_slop;
+vm_offset_t vmsize_max_pcpu_slop;
+vm_offset_t vmsize_max_slop;
+static vm_offset_t swap_reserved;
+#ifdef __LP64__
+SYSCTL_QUAD(_vm, OID_AUTO, swap_max_slop, CTLFLAG_RD, &swap_max_slop, 0,
+ "maximum amount of slop in swap accounting.");
SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0,
"Amount of swap storage needed to back all allocated anonymous memory.");
+#else
+SYSCTL_INT(_vm, OID_AUTO, swap_max_slop, CTLFLAG_RD, &swap_max_slop, 0,
+ "maximum amount of slop in swap accounting.");
+SYSCTL_INT(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0,
+ "Amount of swap storage needed to back all allocated anonymous memory.");
+#endif
static int overcommit = 0;
SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0,
"Configure virtual memory overcommit behavior. See tuning(7) "
@@ -173,18 +188,149 @@
#define SWAP_RESERVE_RLIMIT_ON (1 << 1)
#define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2)
+#ifdef __LP64__
+#define atomic_fetchadd_uintptr atomic_fetchadd_long
+#define atomic_subtract_uintptr atomic_subtract_long
+#define atomic_add_uintptr atomic_add_long
+#else
+#define atomic_fetchadd_uintptr atomic_fetchadd_int
+#define atomic_subtract_uintptr atomic_subtract_int
+#define atomic_add_uintptr atomic_add_int
+#endif
+
+struct pcpu_quota *swap_reserve_pq;
int
-swap_reserve(vm_ooffset_t incr)
+swap_reserve(vm_offset_t incr)
{
return (swap_reserve_by_cred(incr, curthread->td_ucred));
}
+static int
+swap_alloc_can_cache(void)
+{
+ vm_offset_t s;
+
+ if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0)
+ return (1);
+ if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) {
+ s = vm_cnt.v_page_count - vm_cnt.v_free_reserved -
+ vm_wire_count();
+ s *= PAGE_SIZE;
+ } else
+ s = 0;
+ s += swap_total;
+ if (__predict_true(2*swap_max_slop < swap_total - swap_reserved))
+ return (1);
+
+ return (0);
+}
+
+static int
+swap_alloc_slow(void *arg __unused, uintptr_t incr, uintptr_t *slop)
+{
+ vm_offset_t r, s, new, adj;
+ int res, can_cache;
+
+ can_cache = swap_alloc_can_cache();
+ adj = (swap_max_pcpu_slop>>1);
+ MPASS(*slop == 0);
+ if (can_cache) {
+ incr += adj;
+ } else if (__predict_false(swap_max_slop > swap_total - swap_reserved))
+ pcpu_quota_cache_set(swap_reserve_pq, 0);
+
+ res = 0;
+ new = atomic_fetchadd_uintptr(&swap_reserved, incr);
+ r = new + incr;
+ if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) {
+ s = vm_cnt.v_page_count - vm_cnt.v_free_reserved -
+ vm_wire_count();
+ s *= PAGE_SIZE;
+ } else
+ s = 0;
+ s += swap_total;
+ if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s ||
+ priv_check(curthread, PRIV_VM_SWAP_NOQUOTA) == 0) {
+ res = 1;
+ *slop = can_cache*adj;
+ } else
+ atomic_subtract_uintptr(&swap_reserved, incr);
+
+ return (res);
+}
+
+static void
+swap_alloc_init(void *arg __unused)
+{
+ uint64_t slop_pages_pcpu;
+
+ slop_pages_pcpu = physmem / (8*mp_ncpus);
+ swap_max_pcpu_slop = slop_pages_pcpu*PAGE_SIZE;
+ swap_max_slop = swap_max_pcpu_slop*mp_ncpus;
+ vmsize_max_pcpu_slop = (slop_pages_pcpu >> 1)*PAGE_SIZE;
+ vmsize_max_slop = vmsize_max_pcpu_slop*mp_ncpus;
+ swap_reserve_pq = pcpu_quota_alloc(&swap_reserved, swap_max_pcpu_slop,
+ swap_alloc_slow, NULL, M_WAITOK);
+}
+SYSINIT(swap_alloc_init, SI_SUB_VM_CONF, SI_ORDER_ANY, swap_alloc_init, NULL);
+
+static int
+swap_alloc(vm_offset_t incr)
+{
+ return (pcpu_quota_incr(swap_reserve_pq, incr));
+}
+
+static void
+swap_free(vm_offset_t decr)
+{
+ return (pcpu_quota_decr(swap_reserve_pq, decr));
+}
+
int
-swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred)
+swap_pager_vmsize_alloc(void *arg, uintptr_t incr, uintptr_t *slop)
{
- vm_ooffset_t r, s;
- int res, error;
+ struct uidinfo *uip;
+ int can_cache;
+ vm_offset_t new, adj;
+
+ uip = arg;
+ MPASS(*slop == 0);
+ adj = (vmsize_max_pcpu_slop >> 1);
+ if ((overcommit & SWAP_RESERVE_RLIMIT_ON) == 0) {
+ *slop = adj;
+ incr += adj;
+ atomic_add_uintptr(&uip->ui_vmsize, incr);
+ return (1);
+ }
+
+ if (__predict_false((overcommit & SWAP_RESERVE_RLIMIT_ON) &&
+ uip->ui_vmsize + swap_max_slop > lim_cur(curthread, RLIMIT_SWAP)))
+ pcpu_quota_cache_set(uip->ui_vmsize_pq, 0);
+ if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 &&
+ uip->ui_vmsize + incr > lim_cur(curthread, RLIMIT_SWAP) &&
+ priv_check(curthread, PRIV_VM_SWAP_NORLIMIT))
+ return (0);
+ can_cache = 0;
+ if ((overcommit & SWAP_RESERVE_RLIMIT_ON) == 0 ||
+ uip->ui_vmsize + 2*swap_max_slop < lim_cur(curthread, RLIMIT_SWAP))
+ can_cache = 1;
+
+ incr += can_cache*adj;
+ new = atomic_fetchadd_uintptr(&uip->ui_vmsize, incr);
+ if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 &&
+ new + incr > lim_cur(curthread, RLIMIT_SWAP)) {
+ atomic_subtract_uintptr(&uip->ui_vmsize, incr);
+ return (0);
+ }
+ *slop = can_cache*adj;
+ return (1);
+}
+
+int
+swap_reserve_by_cred(vm_offset_t incr, struct ucred *cred)
+{
+ int res;
static int curfail;
static struct timeval lastfail;
struct uidinfo *uip;
@@ -197,52 +343,26 @@
#ifdef RACCT
if (racct_enable) {
PROC_LOCK(curproc);
- error = racct_add(curproc, RACCT_SWAP, incr);
+ res = racct_add(curproc, RACCT_SWAP, incr);
PROC_UNLOCK(curproc);
- if (error != 0)
+ if (res != 0)
return (0);
}
#endif
- res = 0;
- mtx_lock(&sw_dev_mtx);
- r = swap_reserved + incr;
- if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) {
- s = vm_cnt.v_page_count - vm_cnt.v_free_reserved -
- vm_wire_count();
- s *= PAGE_SIZE;
- } else
- s = 0;
- s += swap_total;
- if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s ||
- (error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) {
- res = 1;
- swap_reserved = r;
- }
- mtx_unlock(&sw_dev_mtx);
-
+ res = swap_alloc(incr);
if (res) {
- UIDINFO_VMSIZE_LOCK(uip);
- if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 &&
- uip->ui_vmsize + incr > lim_cur(curthread, RLIMIT_SWAP) &&
- priv_check(curthread, PRIV_VM_SWAP_NORLIMIT))
- res = 0;
- else
- uip->ui_vmsize += incr;
- UIDINFO_VMSIZE_UNLOCK(uip);
- if (!res) {
- mtx_lock(&sw_dev_mtx);
- swap_reserved -= incr;
- mtx_unlock(&sw_dev_mtx);
- }
+ res = pcpu_quota_incr(uip->ui_vmsize_pq, incr);
+ if (!res)
+ swap_free(incr);
}
if (!res && ppsratecheck(&lastfail, &curfail, 1)) {
printf("uid %d, pid %d: swap reservation for %jd bytes failed\n",
- uip->ui_uid, curproc->p_pid, incr);
+ uip->ui_uid, curproc->p_pid, (intmax_t)incr);
}
#ifdef RACCT
- if (!res) {
+ if (racct_enable && !res) {
PROC_LOCK(curproc);
racct_sub(curproc, RACCT_SWAP, incr);
PROC_UNLOCK(curproc);
@@ -253,41 +373,36 @@
}
void
-swap_reserve_force(vm_ooffset_t incr)
+swap_reserve_force(vm_offset_t incr)
{
struct uidinfo *uip;
- mtx_lock(&sw_dev_mtx);
- swap_reserved += incr;
- mtx_unlock(&sw_dev_mtx);
+ if (swap_alloc(incr) == 0)
+ atomic_add_uintptr(&swap_reserved, incr);
#ifdef RACCT
- PROC_LOCK(curproc);
- racct_add_force(curproc, RACCT_SWAP, incr);
- PROC_UNLOCK(curproc);
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_force(curproc, RACCT_SWAP, incr);
+ PROC_UNLOCK(curproc);
+ }
#endif
uip = curthread->td_ucred->cr_ruidinfo;
- PROC_LOCK(curproc);
- UIDINFO_VMSIZE_LOCK(uip);
- uip->ui_vmsize += incr;
- UIDINFO_VMSIZE_UNLOCK(uip);
- PROC_UNLOCK(curproc);
+ atomic_add_uintptr(&uip->ui_vmsize, incr);
}
void
-swap_release(vm_ooffset_t decr)
+swap_release(vm_offset_t decr)
{
struct ucred *cred;
- PROC_LOCK(curproc);
cred = curthread->td_ucred;
swap_release_by_cred(decr, cred);
- PROC_UNLOCK(curproc);
}
void
-swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred)
+swap_release_by_cred(vm_offset_t decr, struct ucred *cred)
{
struct uidinfo *uip;
@@ -296,19 +411,15 @@
if (decr & PAGE_MASK)
panic("swap_release: & PAGE_MASK");
- mtx_lock(&sw_dev_mtx);
if (swap_reserved < decr)
panic("swap_reserved < decr");
- swap_reserved -= decr;
- mtx_unlock(&sw_dev_mtx);
+ swap_free(decr);
- UIDINFO_VMSIZE_LOCK(uip);
if (uip->ui_vmsize < decr)
printf("negative vmsize for uid = %d\n", uip->ui_uid);
- uip->ui_vmsize -= decr;
- UIDINFO_VMSIZE_UNLOCK(uip);
-
- racct_sub_cred(cred, RACCT_SWAP, decr);
+ pcpu_quota_decr(uip->ui_vmsize_pq, decr);
+ if (racct_enable)
+ racct_sub_cred(cred, RACCT_SWAP, decr);
}
#define SWM_POP 0x01 /* pop out */
Index: sys/vm/uma.h
===================================================================
--- sys/vm/uma.h
+++ sys/vm/uma.h
@@ -44,6 +44,8 @@
/* User visible parameters */
#define UMA_SMALLEST_UNIT (PAGE_SIZE / 256) /* Smallest item allocated */
+#define UMA_PCPU_ZONE_SIZE PAGE_SIZE
+
/* Types and type defs */
struct uma_zone;
@@ -279,8 +281,7 @@
* mini-dumps.
*/
#define UMA_ZONE_PCPU 0x8000 /*
- * Allocates mp_maxid + 1 slabs sized to
- * sizeof(struct pcpu).
+ * Allocates mp_maxid + 1 slabs of PAGE_SIZE
*/
#define UMA_ZONE_NUMA 0x10000 /*
* NUMA aware Zone. Implements a best
Index: sys/vm/uma_core.c
===================================================================
--- sys/vm/uma_core.c
+++ sys/vm/uma_core.c
@@ -229,8 +229,10 @@
static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
+static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void page_free(void *, vm_size_t, uint8_t);
+static void pcpu_page_free(void *, vm_size_t, uint8_t);
static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int);
static void cache_drain(uma_zone_t);
static void bucket_drain(uma_zone_t, uma_bucket_t);
@@ -1172,6 +1174,54 @@
return (p);
}
+static void *
+pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
+ int wait)
+{
+ TAILQ_HEAD(, vm_page) alloctail;
+ vm_offset_t addr, zkva;
+ struct pcpu *pc;
+ int cpu;
+ vm_page_t p, p_next;
+
+ TAILQ_INIT(&alloctail);
+ MPASS(bytes == (mp_maxid+1)*PAGE_SIZE);
+ *pflag = UMA_SLAB_KERNEL;
+
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ if (CPU_ABSENT(cpu)) {
+ p = vm_page_alloc_domain(NULL, 0, 0, VM_ALLOC_INTERRUPT |
+ VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
+ ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
+ VM_ALLOC_NOWAIT));
+
+ } else {
+ pc = pcpu_find(cpu);
+ p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, VM_ALLOC_INTERRUPT |
+ VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
+ ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
+ VM_ALLOC_NOWAIT));
+ }
+ if (__predict_false(p == NULL))
+ goto fail;
+ TAILQ_INSERT_TAIL(&alloctail, p, listq);
+ }
+ if ((addr = kva_alloc(bytes)) == 0)
+ goto fail;
+ zkva = addr;
+ TAILQ_FOREACH(p, &alloctail, listq) {
+ pmap_qenter(zkva, &p, 1);
+ zkva += PAGE_SIZE;
+ }
+ return ((void*)addr);
+ fail:
+ TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
+ vm_page_unwire(p, PQ_NONE);
+ vm_page_free(p);
+ }
+ return (NULL);
+}
+
/*
* Allocates a number of pages from within an object
*
@@ -1257,6 +1307,37 @@
kmem_free(vmem, (vm_offset_t)mem, size);
}
+/*
+ * Frees pcpu zone allocations
+ *
+ * Arguments:
+ * mem A pointer to the memory to be freed
+ * size The size of the memory being freed
+ * flags The original p->us_flags field
+ *
+ * Returns:
+ * Nothing
+ */
+static void
+pcpu_page_free(void *mem, vm_size_t size, uint8_t flags)
+{
+ vm_offset_t sva, curva;
+ vm_paddr_t paddr;
+ vm_page_t m;
+
+ MPASS(size == (mp_maxid+1)*PAGE_SIZE);
+ sva = (vm_offset_t)mem;
+ for (curva = sva; curva < sva + size; curva += PAGE_SIZE) {
+ paddr = pmap_kextract(curva);
+ m = PHYS_TO_VM_PAGE(paddr);
+ vm_page_unwire(m, PQ_NONE);
+ vm_page_free(m);
+ }
+ pmap_qremove(sva, size >> PAGE_SHIFT);
+ kva_free(sva, size);
+}
+
+
/*
* Zero fill initializer
*
@@ -1290,9 +1371,8 @@
if (keg->uk_flags & UMA_ZONE_PCPU) {
u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
- slabsize = sizeof(struct pcpu);
- keg->uk_ppera = howmany(ncpus * sizeof(struct pcpu),
- PAGE_SIZE);
+ slabsize = PAGE_SIZE;
+ keg->uk_ppera = ncpus;
} else {
slabsize = UMA_SLAB_SIZE;
keg->uk_ppera = 1;
@@ -1311,7 +1391,7 @@
keg->uk_rsize = rsize;
KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
- keg->uk_rsize < sizeof(struct pcpu),
+ keg->uk_rsize < UMA_PCPU_ZONE_SIZE,
("%s: size %u too large", __func__, keg->uk_rsize));
if (keg->uk_flags & UMA_ZONE_OFFPAGE)
@@ -1529,6 +1609,8 @@
else if (keg->uk_ppera == 1)
keg->uk_allocf = uma_small_alloc;
#endif
+ else if (keg->uk_flags & UMA_ZONE_PCPU)
+ keg->uk_allocf = pcpu_page_alloc;
else
keg->uk_allocf = page_alloc;
#ifdef UMA_MD_SMALL_ALLOC
@@ -1536,6 +1618,9 @@
keg->uk_freef = uma_small_free;
else
#endif
+ if (keg->uk_flags & UMA_ZONE_PCPU)
+ keg->uk_freef = pcpu_page_free;
+ else
keg->uk_freef = page_free;
/*
Index: sys/vm/uma_int.h
===================================================================
--- sys/vm/uma_int.h
+++ sys/vm/uma_int.h
@@ -222,9 +222,8 @@
*
*/
struct uma_keg {
- struct mtx_padalign uk_lock; /* Lock for the keg */
+ struct mtx uk_lock; /* Lock for the keg */
struct uma_hash uk_hash;
-
LIST_HEAD(,uma_zone) uk_zones; /* Keg's zones */
uint32_t uk_cursor; /* Domain alloc cursor. */
@@ -315,41 +314,49 @@
*
*/
struct uma_zone {
- struct mtx_padalign uz_lock; /* Lock for the zone */
- struct mtx_padalign *uz_lockptr;
- const char *uz_name; /* Text name of the zone */
-
- LIST_ENTRY(uma_zone) uz_link; /* List of all zones in keg */
+ /* Offset 0, used in alloc/free fast/medium fast path and const. */
+ struct mtx *uz_lockptr;
+ const char *uz_name; /* Text name of the zone */
struct uma_zone_domain *uz_domain; /* per-domain buckets */
-
- LIST_HEAD(,uma_klink) uz_kegs; /* List of kegs. */
- struct uma_klink uz_klink; /* klink for first keg. */
-
- uma_slaballoc uz_slab; /* Allocate a slab from the backend. */
+ uint32_t uz_flags; /* Flags inherited from kegs */
+ uint32_t uz_size; /* Size inherited from kegs */
uma_ctor uz_ctor; /* Constructor for each allocation */
uma_dtor uz_dtor; /* Destructor */
uma_init uz_init; /* Initializer for each item */
uma_fini uz_fini; /* Finalizer for each item. */
+
+ /* Offset 64, used in bucket replenish. */
uma_import uz_import; /* Import new memory to cache. */
uma_release uz_release; /* Release memory from cache. */
void *uz_arg; /* Import/release argument. */
-
- uint32_t uz_flags; /* Flags inherited from kegs */
- uint32_t uz_size; /* Size inherited from kegs */
-
- volatile u_long uz_allocs UMA_ALIGN; /* Total number of allocations */
- volatile u_long uz_fails; /* Total number of alloc failures */
- volatile u_long uz_frees; /* Total number of frees */
- uint64_t uz_sleeps; /* Total number of alloc sleeps */
+ uma_slaballoc uz_slab; /* Allocate a slab from the backend. */
uint16_t uz_count; /* Amount of items in full bucket */
uint16_t uz_count_min; /* Minimal amount of items there */
+ /* 32bit pad on 64bit. */
+ LIST_ENTRY(uma_zone) uz_link; /* List of all zones in keg */
+ LIST_HEAD(,uma_klink) uz_kegs; /* List of kegs. */
+ /* Offset 128 Rare. */
+ /*
+ * The lock is placed here to avoid adjacent line prefetcher
+ * in fast paths and to take up space near infrequently accessed
+ * members to reduce alignment overhead.
+ */
+ struct mtx uz_lock; /* Lock for the zone */
+ struct uma_klink uz_klink; /* klink for first keg. */
/* The next two fields are used to print a rate-limited warnings. */
const char *uz_warning; /* Warning to print on failure */
struct timeval uz_ratecheck; /* Warnings rate-limiting */
-
struct task uz_maxaction; /* Task to run when at limit */
+ /* 16 bytes of pad. */
+
+ /* Offset 256, atomic stats. */
+ volatile u_long uz_allocs UMA_ALIGN; /* Total number of allocations */
+ volatile u_long uz_fails; /* Total number of alloc failures */
+ volatile u_long uz_frees; /* Total number of frees */
+ uint64_t uz_sleeps; /* Total number of alloc sleeps */
+
/*
* This HAS to be the last item because we adjust the zone size
* based on NCPU and then allocate the space for the zones.
Index: sys/vm/vm.h
===================================================================
--- sys/vm/vm.h
+++ sys/vm/vm.h
@@ -151,11 +151,11 @@
extern int vm_ndomains;
struct ucred;
-int swap_reserve(vm_ooffset_t incr);
-int swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred);
-void swap_reserve_force(vm_ooffset_t incr);
-void swap_release(vm_ooffset_t decr);
-void swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred);
+int swap_reserve(vm_offset_t incr);
+int swap_reserve_by_cred(vm_offset_t incr, struct ucred *cred);
+void swap_reserve_force(vm_offset_t incr);
+void swap_release(vm_offset_t decr);
+void swap_release_by_cred(vm_offset_t decr, struct ucred *cred);
void swapper(void);
#endif /* VM_H */
Index: sys/x86/acpica/srat.c
===================================================================
--- sys/x86/acpica/srat.c
+++ sys/x86/acpica/srat.c
@@ -517,12 +517,15 @@
static void
srat_set_cpus(void *dummy)
{
+#ifdef NUMA
struct cpu_info *cpu;
struct pcpu *pc;
u_int i;
+#endif
if (srat_physaddr == 0)
return;
+#ifdef NUMA
for (i = 0; i < MAXCPU; i++) {
if (CPU_ABSENT(i))
continue;
@@ -538,7 +541,7 @@
printf("SRAT: CPU %u has memory domain %d\n", i,
cpu->domain);
}
-
+#endif
/* Last usage of the cpus array, unmap it. */
pmap_unmapbios((vm_offset_t)cpus, sizeof(*cpus) * (max_apic_id + 1));
cpus = NULL;

File Metadata

Mime Type
text/plain
Expires
Thu, Mar 12, 2:16 AM (14 h, 59 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
29560670
Default Alt Text
D15975.diff (50 KB)

Event Timeline