Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F147446042
D15975.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
50 KB
Referenced Files
None
Subscribers
None
D15975.diff
View Options
Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -114,6 +114,7 @@
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/systm.h>
+#include <sys/epoch.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
@@ -349,6 +350,7 @@
vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
pt_entry_t pg_nx;
+static epoch_t pmap_epoch;
static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
@@ -438,14 +440,12 @@
CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
"Count of saved TLB context on switch");
-static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
- LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
-static struct mtx invl_gen_mtx;
-static u_long pmap_invl_gen = 0;
-/* Fake lock object to satisfy turnstiles interface. */
-static struct lock_object invl_gen_ts = {
- .lo_name = "invlts",
-};
+static void
+pmap_epoch_init(void *arg __unused)
+{
+ pmap_epoch = epoch_alloc(EPOCH_PREEMPT);
+}
+SYSINIT(epoch, SI_SUB_TASKQ + 1, SI_ORDER_ANY, pmap_epoch_init, NULL);
static bool
pmap_not_in_di(void)
@@ -468,19 +468,8 @@
static void
pmap_delayed_invl_started(void)
{
- struct pmap_invl_gen *invl_gen;
- u_long currgen;
-
- invl_gen = &curthread->td_md.md_invl_gen;
- PMAP_ASSERT_NOT_IN_DI();
- mtx_lock(&invl_gen_mtx);
- if (LIST_EMPTY(&pmap_invl_gen_tracker))
- currgen = pmap_invl_gen;
- else
- currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
- invl_gen->gen = currgen + 1;
- LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
- mtx_unlock(&invl_gen_mtx);
+ epoch_enter_preempt(pmap_epoch);
+ curthread->td_md.md_invl_gen.gen = 1;
}
/*
@@ -500,28 +489,8 @@
static void
pmap_delayed_invl_finished(void)
{
- struct pmap_invl_gen *invl_gen, *next;
- struct turnstile *ts;
-
- invl_gen = &curthread->td_md.md_invl_gen;
- KASSERT(invl_gen->gen != 0, ("missed invl_started"));
- mtx_lock(&invl_gen_mtx);
- next = LIST_NEXT(invl_gen, link);
- if (next == NULL) {
- turnstile_chain_lock(&invl_gen_ts);
- ts = turnstile_lookup(&invl_gen_ts);
- pmap_invl_gen = invl_gen->gen;
- if (ts != NULL) {
- turnstile_broadcast(ts, TS_SHARED_QUEUE);
- turnstile_unpend(ts);
- }
- turnstile_chain_unlock(&invl_gen_ts);
- } else {
- next->gen = invl_gen->gen;
- }
- LIST_REMOVE(invl_gen, link);
- mtx_unlock(&invl_gen_mtx);
- invl_gen->gen = 0;
+ curthread->td_md.md_invl_gen.gen = 0;
+ epoch_exit_preempt(pmap_epoch);
}
#ifdef PV_STATS
@@ -544,36 +513,14 @@
* pmap_delayed_invl_wait(), upon its return we know that no CPU has a
* valid mapping for the page m in either its page table or TLB.
*
- * This function works by blocking until the global DI generation
- * number catches up with the generation number associated with the
- * given page m and its PV list. Since this function's callers
- * typically own an object lock and sometimes own a page lock, it
- * cannot sleep. Instead, it blocks on a turnstile to relinquish the
- * processor.
+ * This function works by waiting for a grace period to elapse, which
+ * guarantees that any caller that was inside a DI epoch section when
+ * this function was initially called has since exited that section.
*/
static void
pmap_delayed_invl_wait(vm_page_t m)
{
- struct turnstile *ts;
- u_long *m_gen;
-#ifdef PV_STATS
- bool accounted = false;
-#endif
-
- m_gen = pmap_delayed_invl_genp(m);
- while (*m_gen > pmap_invl_gen) {
-#ifdef PV_STATS
- if (!accounted) {
- atomic_add_long(&invl_wait, 1);
- accounted = true;
- }
-#endif
- ts = turnstile_trywait(&invl_gen_ts);
- if (*m_gen > pmap_invl_gen)
- turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
- else
- turnstile_cancel(ts);
- }
+ epoch_wait_preempt(pmap_epoch);
}
/*
@@ -1130,11 +1077,6 @@
TAILQ_INIT(&kernel_pmap->pm_pvchunk);
kernel_pmap->pm_flags = pmap_flags;
- /*
- * Initialize the TLB invalidations generation number lock.
- */
- mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);
-
/*
* Reserve some special page table entries/VA space for temporary
* mapping of pages.
Index: sys/amd64/include/counter.h
===================================================================
--- sys/amd64/include/counter.h
+++ sys/amd64/include/counter.h
@@ -45,7 +45,7 @@
counter_u64_read_one(uint64_t *p, int cpu)
{
- return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+ return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu));
}
static inline uint64_t
@@ -65,7 +65,7 @@
counter_u64_zero_one_cpu(void *arg)
{
- *((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+ *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE *
PCPU_GET(cpuid))) = 0;
}
Index: sys/arm/include/counter.h
===================================================================
--- sys/arm/include/counter.h
+++ sys/arm/include/counter.h
@@ -47,7 +47,7 @@
counter_u64_read_one(uint64_t *p, int cpu)
{
- return (atomic_load_64((uint64_t *)((char *)p + sizeof(struct pcpu) *
+ return (atomic_load_64((uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE *
cpu)));
}
@@ -68,7 +68,7 @@
counter_u64_zero_one_cpu(void *arg)
{
- atomic_store_64((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+ atomic_store_64((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE *
PCPU_GET(cpuid)), 0);
}
Index: sys/arm64/include/counter.h
===================================================================
--- sys/arm64/include/counter.h
+++ sys/arm64/include/counter.h
@@ -44,7 +44,7 @@
counter_u64_read_one(uint64_t *p, int cpu)
{
- return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+ return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu));
}
static inline uint64_t
@@ -64,7 +64,7 @@
counter_u64_zero_one_cpu(void *arg)
{
- *((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+ *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE *
PCPU_GET(cpuid))) = 0;
}
Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -3894,6 +3894,8 @@
kern/subr_msgbuf.c standard
kern/subr_param.c standard
kern/subr_pcpu.c standard
+kern/subr_pcpu_quota.c standard
+kern/subr_pcpu_refcount.c standard
kern/subr_pctrie.c standard
kern/subr_pidctrl.c standard
kern/subr_power.c standard
Index: sys/i386/include/counter.h
===================================================================
--- sys/i386/include/counter.h
+++ sys/i386/include/counter.h
@@ -104,13 +104,13 @@
critical_enter();
CPU_FOREACH(i) {
res += *(uint64_t *)((char *)p +
- sizeof(struct pcpu) * i);
+ UMA_PCPU_ZONE_SIZE * i);
}
critical_exit();
} else {
CPU_FOREACH(i)
res += counter_u64_read_one_8b((uint64_t *)((char *)p +
- sizeof(struct pcpu) * i));
+ UMA_PCPU_ZONE_SIZE * i));
}
return (res);
}
@@ -137,7 +137,7 @@
{
uint64_t *p;
- p = (uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid));
+ p = (uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE * PCPU_GET(cpuid));
counter_u64_zero_one_8b(p);
}
@@ -149,7 +149,7 @@
if ((cpu_feature & CPUID_CX8) == 0) {
critical_enter();
CPU_FOREACH(i)
- *(uint64_t *)((char *)c + sizeof(struct pcpu) * i) = 0;
+ *(uint64_t *)((char *)c + UMA_PCPU_ZONE_SIZE * i) = 0;
critical_exit();
} else {
smp_rendezvous(smp_no_rendezvous_barrier,
Index: sys/kern/kern_prot.c
===================================================================
--- sys/kern/kern_prot.c
+++ sys/kern/kern_prot.c
@@ -1829,7 +1829,8 @@
struct ucred *cr;
cr = malloc(sizeof(*cr), M_CRED, M_WAITOK | M_ZERO);
- refcount_init(&cr->cr_ref, 1);
+ cr->cr_pref = pcpu_ref_alloc(M_WAITOK);
+
#ifdef AUDIT
audit_cred_init(cr);
#endif
@@ -1848,11 +1849,19 @@
struct ucred *
crhold(struct ucred *cr)
{
+ if (cr->cr_flags & CRED_FLAG_ONSTACK)
+ return (cr);
- refcount_acquire(&cr->cr_ref);
+ pcpu_ref_acquire(cr->cr_pref);
return (cr);
}
+void
+crdrop_owner(struct ucred *cr)
+{
+ pcpu_ref_kill(cr->cr_pref);
+}
+
/*
* Free a cred structure. Throws away space when ref count gets to 0.
*/
@@ -1860,9 +1869,12 @@
crfree(struct ucred *cr)
{
- KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref));
- KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred"));
- if (refcount_release(&cr->cr_ref)) {
+ KASSERT((unsigned int)((uintptr_t)cr->cr_pref) != 0xdeadc0de, ("cr: %p dangling reference to ucred", cr));
+ if (cr->cr_flags & CRED_FLAG_ONSTACK)
+ return;
+
+ if (pcpu_ref_release(cr->cr_pref)) {
+ pcpu_ref_free(cr->cr_pref);
/*
* Some callers of crget(), such as nfs_statfs(),
* allocate a temporary credential, but don't
@@ -1898,7 +1910,7 @@
crcopy(struct ucred *dest, struct ucred *src)
{
- KASSERT(dest->cr_ref == 1, ("crcopy of shared ucred"));
+ //KASSERT(dest->cr_ref == 1, ("crcopy of shared ucred"));
bcopy(&src->cr_startcopy, &dest->cr_startcopy,
(unsigned)((caddr_t)&src->cr_endcopy -
(caddr_t)&src->cr_startcopy));
@@ -1913,6 +1925,7 @@
#ifdef MAC
mac_cred_copy(src, dest);
#endif
+ dest->cr_flags &= ~(CRED_FLAG_ONSTACK|CRED_FLAG_OWNED);
}
/*
@@ -1953,7 +1966,16 @@
void
proc_set_cred_init(struct proc *p, struct ucred *newcred)
{
-
+ struct ucred *dupcred;
+#ifdef notyet
+ if (newcred->cr_flags & CRED_FLAG_OWNED)
+#endif
+ {
+ dupcred = crdup(newcred);
+ crfree(newcred);
+ newcred = dupcred;
+ }
+ newcred->cr_flags |= CRED_FLAG_OWNED;
p->p_ucred = newcred;
}
@@ -1975,10 +1997,23 @@
MPASS(p->p_ucred != NULL);
if (newcred == NULL)
MPASS(p->p_state == PRS_ZOMBIE);
- else
+ else {
+#ifdef notyet
+ if (newcred->cr_flags & CRED_FLAG_OWNED)
+ {
+ oldcred = crdup(newcred);
+ crfree(newcred);
+ newcred = oldcred;
+ newcred->cr_flags |= CRED_FLAG_OWNED;
+ }
+#endif
+ MPASS((newcred->cr_flags & CRED_FLAG_OWNED) == 0);
+ newcred->cr_flags |= CRED_FLAG_OWNED;
PROC_LOCK_ASSERT(p, MA_OWNED);
-
+ }
oldcred = p->p_ucred;
+ crdrop_owner(oldcred);
+
p->p_ucred = newcred;
if (newcred != NULL)
PROC_UPDATE_COW(p);
@@ -2002,7 +2037,6 @@
oldcred = p->p_ucred;
}
crcopy(cr, oldcred);
-
return (oldcred);
}
Index: sys/kern/kern_resource.c
===================================================================
--- sys/kern/kern_resource.c
+++ sys/kern/kern_resource.c
@@ -47,6 +47,7 @@
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/pcpu_quota.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
@@ -65,6 +66,7 @@
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
+#include <vm/swap_pager.h>
static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures");
@@ -1244,6 +1246,7 @@
return (uip);
}
+
/*
* Find or allocate a struct uidinfo for a particular uid.
* Returns with uidinfo struct referenced.
@@ -1276,7 +1279,8 @@
racct_create(&new_uip->ui_racct);
refcount_init(&new_uip->ui_ref, 1);
new_uip->ui_uid = uid;
- mtx_init(&new_uip->ui_vmsize_mtx, "ui_vmsize", NULL, MTX_DEF);
+ new_uip->ui_vmsize_pq = pcpu_quota_alloc(&new_uip->ui_vmsize,
+ vmsize_max_pcpu_slop, swap_pager_vmsize_alloc, new_uip, M_WAITOK);
rw_wlock(&uihashtbl_lock);
/*
@@ -1291,7 +1295,6 @@
} else {
rw_wunlock(&uihashtbl_lock);
racct_destroy(&new_uip->ui_racct);
- mtx_destroy(&new_uip->ui_vmsize_mtx);
free(new_uip, M_UIDINFO);
}
return (uip);
@@ -1343,6 +1346,7 @@
LIST_REMOVE(uip, ui_hash);
rw_wunlock(&uihashtbl_lock);
+ pcpu_quota_cache_set(uip->ui_vmsize_pq, 0);
if (uip->ui_sbsize != 0)
printf("freeing uidinfo: uid = %d, sbsize = %ld\n",
uip->ui_uid, uip->ui_sbsize);
@@ -1352,7 +1356,6 @@
if (uip->ui_vmsize != 0)
printf("freeing uidinfo: uid = %d, swapuse = %lld\n",
uip->ui_uid, (unsigned long long)uip->ui_vmsize);
- mtx_destroy(&uip->ui_vmsize_mtx);
free(uip, M_UIDINFO);
}
Index: sys/kern/subr_counter.c
===================================================================
--- sys/kern/subr_counter.c
+++ sys/kern/subr_counter.c
@@ -50,6 +50,15 @@
counter_u64_zero_inline(c);
}
+static void
+counter_u64_zero_sync(counter_u64_t c)
+{
+ int cpu;
+
+ CPU_FOREACH(cpu)
+ *(uint64_t*)zpcpu_get_cpu(c, cpu) = 0;
+}
+
uint64_t
counter_u64_fetch(counter_u64_t c)
{
@@ -64,7 +73,7 @@
r = uma_zalloc_pcpu(pcpu_zone_64, flags);
if (r != NULL)
- counter_u64_zero(r);
+ counter_u64_zero_sync(r);
return (r);
}
Index: sys/kern/subr_pcpu.c
===================================================================
--- sys/kern/subr_pcpu.c
+++ sys/kern/subr_pcpu.c
@@ -75,8 +75,8 @@
static DPCPU_DEFINE(char, modspace[DPCPU_MODMIN]);
static TAILQ_HEAD(, dpcpu_free) dpcpu_head = TAILQ_HEAD_INITIALIZER(dpcpu_head);
static struct sx dpcpu_lock;
-uintptr_t dpcpu_off[MAXCPU];
-struct pcpu *cpuid_to_pcpu[MAXCPU];
+__read_mostly uintptr_t dpcpu_off[MAXCPU];
+__read_mostly struct pcpu *cpuid_to_pcpu[MAXCPU];
struct cpuhead cpuhead = STAILQ_HEAD_INITIALIZER(cpuhead);
/*
Index: sys/kern/subr_pcpu_quota.c
===================================================================
--- /dev/null
+++ sys/kern/subr_pcpu_quota.c
@@ -0,0 +1,184 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/epoch.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/pcpu_quota.h>
+#include <sys/smp.h>
+#include <vm/uma.h>
+
+#include <ck_pr.h>
+
+static MALLOC_DEFINE(M_PCPU_QUOTA, "Per-cpu", "Per-cpu resource accounting.");
+
+#define PCPU_QUOTA_SLOP_GET(p) zpcpu_get((p)->pq_slop)
+#define PCPU_QUOTA_CAN_CACHE 0x1
+#define PCPU_QUOTA_FLUSHING 0x2
+
+struct pcpu_quota {
+ void *pq_context;
+ counter_u64_t pq_slop;
+ uintptr_t *pq_global;
+ uintptr_t pq_pcpu_slop;
+ int (*pq_alloc)(void *context, uintptr_t incr, uintptr_t *slop);
+ volatile int pq_flags;
+} __aligned(CACHE_LINE_SIZE);
+
+
+#ifdef __LP64__
+#define atomic_subtract_uintptr atomic_subtract_long
+#else
+#define atomic_subtract_uintptr atomic_subtract_int
+#endif
+
+static void
+pcpu_quota_flush(struct pcpu_quota *pq)
+{
+ int64_t *p;
+ uintptr_t value;
+ int cpu;
+
+ value = 0;
+ epoch_enter(global_epoch);
+ CPU_FOREACH(cpu) {
+ p = zpcpu_get_cpu(pq->pq_slop, cpu);
+ MPASS(*p >= 0);
+ value += *p;
+ *p = 0;
+ }
+ if (value)
+ atomic_subtract_uintptr(pq->pq_global, value);
+ epoch_exit(global_epoch);
+}
+
+void
+pcpu_quota_cache_set(struct pcpu_quota *pq, int enable)
+{
+ int *flagsp;
+
+ flagsp = (int *)(uintptr_t)&pq->pq_flags;
+ if (!enable && (pq->pq_flags & PCPU_QUOTA_CAN_CACHE)) {
+ if (ck_pr_btr_int(flagsp, PCPU_QUOTA_CAN_CACHE) == 0 &&
+ ck_pr_bts_int(flagsp, PCPU_QUOTA_FLUSHING) == 0) {
+ epoch_wait(global_epoch);
+ pcpu_quota_flush(pq);
+ ck_pr_btr_int(flagsp, PCPU_QUOTA_FLUSHING);
+ }
+ } else if (enable && (pq->pq_flags & PCPU_QUOTA_CAN_CACHE) == 0) {
+ while (pq->pq_flags & PCPU_QUOTA_FLUSHING)
+ cpu_spinwait();
+ ck_pr_bts_int(flagsp, PCPU_QUOTA_CAN_CACHE);
+ }
+}
+
+struct pcpu_quota *
+pcpu_quota_alloc(uintptr_t *global, uintptr_t pcpu_slop,
+ int (*alloc)(void *, uintptr_t, uintptr_t*), void *context, int flags)
+{
+ struct pcpu_quota *pq;
+
+ flags &= ~M_ZERO;
+ if ((pq = malloc(sizeof(*pq), M_PCPU_QUOTA, flags)) == NULL)
+ return (NULL);
+ if ((pq->pq_slop = counter_u64_alloc(flags)) == NULL) {
+ free(pq, M_PCPU_QUOTA);
+ return (NULL);
+ }
+ pq->pq_pcpu_slop = pcpu_slop;
+ pq->pq_context = context;
+ pq->pq_global = global;
+ pq->pq_alloc = alloc;
+ pq->pq_flags = PCPU_QUOTA_CAN_CACHE;
+ return (pq);
+}
+
+void
+pcpu_quota_free(struct pcpu_quota *pq)
+{
+ counter_u64_free(pq->pq_slop);
+ free(pq, M_PCPU_QUOTA);
+}
+
+int
+pcpu_quota_incr(struct pcpu_quota *pq, uintptr_t incr)
+{
+ int64_t *p;
+ int rc;
+
+ epoch_enter(global_epoch);
+ p = PCPU_QUOTA_SLOP_GET(pq);
+ if (*p >= incr) {
+ *p -= incr;
+ epoch_exit(global_epoch);
+ return (1);
+ }
+ incr -= *p;
+ *p = 0;
+ rc = pq->pq_alloc(pq->pq_context, incr, (uintptr_t *)p);
+ if ( __predict_false((pq->pq_flags & PCPU_QUOTA_CAN_CACHE) == 0) && *p > 0)
+ pcpu_quota_cache_set(pq, 1);
+
+ epoch_exit(global_epoch);
+ return (rc);
+}
+
+void
+pcpu_quota_decr(struct pcpu_quota *pq, uintptr_t decr)
+{
+ int64_t *p;
+ int64_t value;
+ long adj;
+
+ epoch_enter(global_epoch);
+ p = PCPU_QUOTA_SLOP_GET(pq);
+ if (__predict_true(pq->pq_flags & PCPU_QUOTA_CAN_CACHE)) {
+ if (*p + decr <= pq->pq_pcpu_slop) {
+ *p += decr;
+ epoch_exit(global_epoch);
+ return;
+ }
+ adj = (pq->pq_pcpu_slop >> 1);
+ value = decr + (*p - adj);
+ } else {
+ adj = 0;
+ value = *p + decr;
+ }
+ MPASS(value > 0);
+ *p = adj;
+ atomic_subtract_uintptr(pq->pq_global, (uintptr_t)value);
+ epoch_exit(global_epoch);
+}
+
Index: sys/kern/subr_pcpu_refcount.c
===================================================================
--- /dev/null
+++ sys/kern/subr_pcpu_refcount.c
@@ -0,0 +1,164 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/epoch.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/pcpu_refcount.h>
+#include <sys/smp.h>
+
+static MALLOC_DEFINE(M_PCPU_REF, "Pcpuref", "Per-cpu reference counting.");
+#define PR_DYING 0x1
+
+#define OWNER_REFCOUNT (INT_MAX >> 2)
+
+
+struct pcpu_ref {
+ counter_u64_t pr_pcpu_refs;
+ volatile int pr_refcnt;
+ int pr_flags;
+} __aligned(CACHE_LINE_SIZE);
+
+pcpu_ref_t
+pcpu_ref_alloc(int flags)
+{
+ pcpu_ref_t pr;
+
+ pr = malloc(sizeof(*pr), M_PCPU_REF, flags);
+ if (pr == NULL)
+ return (NULL);
+ if ((pr->pr_pcpu_refs = counter_u64_alloc(flags)) == NULL) {
+ free(pr, M_PCPU_REF);
+ return (NULL);
+ }
+ pr->pr_flags = 0;
+ pr->pr_refcnt = OWNER_REFCOUNT;
+#ifdef INVARIANTS
+ int cpu;
+ int64_t sum = 0;
+ CPU_FOREACH(cpu)
+ sum += *(int64_t*)zpcpu_get_cpu(pr->pr_pcpu_refs, cpu);
+ KASSERT(sum == 0, ("sum: %jd != 0", sum));
+#endif
+ return (pr);
+}
+
+void
+pcpu_ref_free(pcpu_ref_t pr)
+{
+ counter_u64_free(pr->pr_pcpu_refs);
+ free(pr, M_PCPU_REF);
+}
+
+void
+pcpu_ref_incr(pcpu_ref_t pr, int incr)
+{
+ epoch_enter(global_epoch);
+#ifdef INVARIANTS
+ int64_t sum = 0;
+ int refcount, cpu;
+
+ refcount = pr->pr_refcnt;
+ if (__predict_true((pr->pr_flags & PR_DYING) == 0)) {
+ CPU_FOREACH(cpu)
+ sum += *(int64_t*)zpcpu_get_cpu(pr->pr_pcpu_refs, cpu);
+ refcount -= OWNER_REFCOUNT-1;
+ }
+ KASSERT(sum + refcount > -2, ("sum: %jd + refcount: %d <= 0", sum, refcount));
+ if (sum + refcount <= 0) {
+ printf("sum: %jd + refcount: %d <= 0", sum, refcount);
+ kdb_backtrace();
+ }
+#endif
+ if (__predict_false(pr->pr_flags & PR_DYING))
+ atomic_add_int(&pr->pr_refcnt, incr);
+ else
+ *(int64_t*)zpcpu_get(pr->pr_pcpu_refs) += incr;
+ epoch_exit(global_epoch);
+}
+
+int
+pcpu_ref_decr(pcpu_ref_t pr, int decr)
+{
+ int rc, value;
+ epoch_enter(global_epoch);
+#ifdef INVARIANTS
+ int64_t sum = 0;
+ int cpu, refcount;
+
+ refcount = pr->pr_refcnt;
+ if (__predict_true((pr->pr_flags & PR_DYING) == 0)) {
+ CPU_FOREACH(cpu)
+ sum += *(int64_t*)zpcpu_get_cpu(pr->pr_pcpu_refs, cpu);
+ refcount -= OWNER_REFCOUNT-1;
+ }
+
+ KASSERT(sum + refcount >= decr, ("sum: %jd + refcount: %d < decr: %d",
+ sum, refcount, decr));
+#endif
+ rc = 0;
+ if (__predict_true((pr->pr_flags & PR_DYING) == 0))
+ *(int64_t*)zpcpu_get(pr->pr_pcpu_refs) -= decr;
+ else {
+ value = atomic_fetchadd_int(&pr->pr_refcnt, -decr);
+ MPASS(value >= decr);
+ if (value == decr)
+ rc = 1;
+ }
+ epoch_exit(global_epoch);
+ return (rc);
+}
+
+void
+pcpu_ref_kill(pcpu_ref_t pr)
+{
+ int cpu, sum, value;
+
+ MPASS((pr->pr_flags & PR_DYING) == 0);
+ pr->pr_flags |= PR_DYING;
+ epoch_wait(global_epoch);
+ sum = 0;
+ CPU_FOREACH(cpu)
+ sum += *(int64_t*)zpcpu_get_cpu(pr->pr_pcpu_refs, cpu);
+#ifdef INVARIANTS
+ KASSERT(sum + pr->pr_refcnt >= OWNER_REFCOUNT, ("sum: %d + pr_refcnt: %d < owner: %d",
+ sum, pr->pr_refcnt, OWNER_REFCOUNT));
+#endif
+
+ value = atomic_fetchadd_int(&pr->pr_refcnt, sum-OWNER_REFCOUNT+1);
+ MPASS(value + sum >= OWNER_REFCOUNT);
+}
Index: sys/mips/include/counter.h
===================================================================
--- sys/mips/include/counter.h
+++ sys/mips/include/counter.h
@@ -47,7 +47,7 @@
counter_u64_read_one(uint64_t *p, int cpu)
{
- return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+ return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu));
}
static inline uint64_t
@@ -68,7 +68,7 @@
counter_u64_zero_one_cpu(void *arg)
{
- *((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+ *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE *
PCPU_GET(cpuid))) = 0;
}
Index: sys/powerpc/include/counter.h
===================================================================
--- sys/powerpc/include/counter.h
+++ sys/powerpc/include/counter.h
@@ -50,7 +50,7 @@
counter_u64_read_one(uint64_t *p, int cpu)
{
- return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+ return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu));
}
static inline uint64_t
@@ -70,7 +70,7 @@
counter_u64_zero_one_cpu(void *arg)
{
- *((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+ *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE *
PCPU_GET(cpuid))) = 0;
}
@@ -113,7 +113,7 @@
counter_u64_read_one(uint64_t *p, int cpu)
{
- return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+ return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu));
}
static inline uint64_t
@@ -134,7 +134,7 @@
counter_u64_zero_one_cpu(void *arg)
{
- *((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+ *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE *
PCPU_GET(cpuid))) = 0;
}
Index: sys/riscv/include/counter.h
===================================================================
--- sys/riscv/include/counter.h
+++ sys/riscv/include/counter.h
@@ -46,7 +46,7 @@
counter_u64_read_one(uint64_t *p, int cpu)
{
- return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+ return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu));
}
static inline uint64_t
@@ -67,7 +67,7 @@
counter_u64_zero_one_cpu(void *arg)
{
- *((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+ *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE *
PCPU_GET(cpuid))) = 0;
}
Index: sys/sparc64/include/counter.h
===================================================================
--- sys/sparc64/include/counter.h
+++ sys/sparc64/include/counter.h
@@ -47,7 +47,7 @@
counter_u64_read_one(uint64_t *p, int cpu)
{
- return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu));
+ return (*(uint64_t *)((char *)p + UMA_PCPU_ZONE_SIZE * cpu));
}
static inline uint64_t
@@ -68,7 +68,7 @@
counter_u64_zero_one_cpu(void *arg)
{
- *((uint64_t *)((char *)arg + sizeof(struct pcpu) *
+ *((uint64_t *)((char *)arg + UMA_PCPU_ZONE_SIZE *
PCPU_GET(cpuid))) = 0;
}
Index: sys/sys/pcpu.h
===================================================================
--- sys/sys/pcpu.h
+++ sys/sys/pcpu.h
@@ -208,14 +208,16 @@
zpcpu_get(void *base)
{
- return ((char *)(base) + sizeof(struct pcpu) * curcpu);
+ /* UMA_PCPU_ZONE_SIZE == PAGE_SIZE */
+ return ((char *)(base) + PAGE_SIZE * curcpu);
}
static inline void *
zpcpu_get_cpu(void *base, int cpu)
{
- return ((char *)(base) + sizeof(struct pcpu) * cpu);
+ /* UMA_PCPU_ZONE_SIZE == PAGE_SIZE */
+ return ((char *)(base) + PAGE_SIZE * cpu);
}
/*
Index: sys/sys/pcpu_quota.h
===================================================================
--- /dev/null
+++ sys/sys/pcpu_quota.h
@@ -0,0 +1,42 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_PCPU_QUOTA_H_
+#define _SYS_PCPU_QUOTA_H_
+struct pcpu_quota;
+
+struct pcpu_quota *pcpu_quota_alloc(uintptr_t *global, uintptr_t pcpu_slop,
+ int (*alloc)(void *, uintptr_t, uintptr_t *), void *context, int flags);
+
+void pcpu_quota_cache_set(struct pcpu_quota *pq, int enable);
+void pcpu_quota_free(struct pcpu_quota *pq);
+int pcpu_quota_incr(struct pcpu_quota *pq, uintptr_t incr);
+void pcpu_quota_decr(struct pcpu_quota *pq, uintptr_t decr);
+
+#endif
Index: sys/sys/pcpu_refcount.h
===================================================================
--- /dev/null
+++ sys/sys/pcpu_refcount.h
@@ -0,0 +1,55 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_PCPU_REFCOUNT_H_
+#define _SYS_PCPU_REFCOUNT_H_
+
+
+struct pcpu_ref;
+typedef struct pcpu_ref *pcpu_ref_t;
+
+pcpu_ref_t pcpu_ref_alloc(int flags);
+void pcpu_ref_free(pcpu_ref_t pr);
+void pcpu_ref_incr(pcpu_ref_t pr, int incr);
+int pcpu_ref_decr(pcpu_ref_t pr, int decr);
+void pcpu_ref_kill(pcpu_ref_t pr);
+
+static inline void
+pcpu_ref_acquire(pcpu_ref_t pr)
+{
+ pcpu_ref_incr(pr, 1);
+}
+
+static inline int
+pcpu_ref_release(pcpu_ref_t pr)
+{
+ return (pcpu_ref_decr(pr, 1));
+}
+
+#endif
Index: sys/sys/resourcevar.h
===================================================================
--- sys/sys/resourcevar.h
+++ sys/sys/resourcevar.h
@@ -97,8 +97,8 @@
*/
struct uidinfo {
LIST_ENTRY(uidinfo) ui_hash; /* (c) hash chain of uidinfos */
- struct mtx ui_vmsize_mtx;
- vm_ooffset_t ui_vmsize; /* (d) swap reservation by uid */
+ vm_offset_t ui_vmsize; /* (d) swap reservation by uid */
+ struct pcpu_quota *ui_vmsize_pq;
long ui_sbsize; /* (b) socket buffer space consumed */
long ui_proccnt; /* (b) number of processes */
long ui_ptscnt; /* (b) number of pseudo-terminals */
Index: sys/sys/ucred.h
===================================================================
--- sys/sys/ucred.h
+++ sys/sys/ucred.h
@@ -48,8 +48,9 @@
* priv(9) interface should be used to check for privilege.
*/
#if defined(_KERNEL) || defined(_WANT_UCRED)
+#include <sys/pcpu_refcount.h>
struct ucred {
- u_int cr_ref; /* reference count */
+ pcpu_ref_t cr_pref; /* pcpu reference count */
#define cr_startcopy cr_uid
uid_t cr_uid; /* effective user id */
uid_t cr_ruid; /* real user id */
@@ -78,6 +79,8 @@
* Flags for cr_flags.
*/
#define CRED_FLAG_CAPMODE 0x00000001 /* In capability mode. */
+#define CRED_FLAG_ONSTACK 0x00000002 /* Stack allocated */
+#define CRED_FLAG_OWNED 0x00000004 /* Has an owner */
/*
* This is the external representation of struct ucred.
@@ -111,6 +114,7 @@
void proc_set_cred_init(struct proc *p, struct ucred *cr);
struct ucred *proc_set_cred(struct proc *p, struct ucred *cr);
void crfree(struct ucred *cr);
+void crdrop_owner(struct ucred *cr);
struct ucred *crget(void);
struct ucred *crhold(struct ucred *cr);
void cru2x(struct ucred *cr, struct xucred *xcr);
Index: sys/ufs/ufs/ufs_vnops.c
===================================================================
--- sys/ufs/ufs/ufs_vnops.c
+++ sys/ufs/ufs/ufs_vnops.c
@@ -1846,7 +1846,7 @@
* XXX This seems to never be accessed out of
* our context so a stack variable is ok.
*/
- refcount_init(&ucred.cr_ref, 1);
+ ucred.cr_flags = CRED_FLAG_ONSTACK;
ucred.cr_uid = ip->i_uid;
ucred.cr_ngroups = 1;
ucred.cr_groups = &ucred_group;
@@ -2610,7 +2610,7 @@
* XXX This seems to never be accessed out of our
* context so a stack variable is ok.
*/
- refcount_init(&ucred.cr_ref, 1);
+ ucred.cr_flags = CRED_FLAG_ONSTACK;
ucred.cr_uid = ip->i_uid;
ucred.cr_ngroups = 1;
ucred.cr_groups = &ucred_group;
Index: sys/vm/swap_pager.h
===================================================================
--- sys/vm/swap_pager.h
+++ sys/vm/swap_pager.h
@@ -76,6 +76,8 @@
#ifdef _KERNEL
extern int swap_pager_avail;
+extern vm_offset_t vmsize_max_pcpu_slop;
+extern vm_offset_t vmsize_max_slop;
struct xswdev;
int swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len);
@@ -86,6 +88,7 @@
int swap_pager_nswapdev(void);
int swap_pager_reserve(vm_object_t, vm_pindex_t, vm_size_t);
void swap_pager_status(int *total, int *used);
+int swap_pager_vmsize_alloc(void *arg, vm_offset_t incr, vm_offset_t *slop);
void swapoff_all(void);
#endif /* _KERNEL */
Index: sys/vm/swap_pager.c
===================================================================
--- sys/vm/swap_pager.c
+++ sys/vm/swap_pager.c
@@ -88,6 +88,7 @@
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
+#include <sys/pcpu_quota.h>
#include <sys/pctrie.h>
#include <sys/racct.h>
#include <sys/resource.h>
@@ -98,6 +99,7 @@
#include <sys/sysproto.h>
#include <sys/blist.h>
#include <sys/lock.h>
+#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
@@ -154,9 +156,22 @@
static vm_ooffset_t swap_total;
SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0,
"Total amount of available swap storage.");
-static vm_ooffset_t swap_reserved;
+static vm_offset_t swap_max_pcpu_slop;
+static vm_offset_t swap_max_slop;
+vm_offset_t vmsize_max_pcpu_slop;
+vm_offset_t vmsize_max_slop;
+static vm_offset_t swap_reserved;
+#ifdef __LP64__
+SYSCTL_QUAD(_vm, OID_AUTO, swap_max_slop, CTLFLAG_RD, &swap_max_slop, 0,
+ "maximum amount of slop in swap accounting.");
SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0,
"Amount of swap storage needed to back all allocated anonymous memory.");
+#else
+SYSCTL_INT(_vm, OID_AUTO, swap_max_slop, CTLFLAG_RD, &swap_max_slop, 0,
+ "maximum amount of slop in swap accounting.");
+SYSCTL_INT(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0,
+ "Amount of swap storage needed to back all allocated anonymous memory.");
+#endif
static int overcommit = 0;
SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0,
"Configure virtual memory overcommit behavior. See tuning(7) "
@@ -173,18 +188,149 @@
#define SWAP_RESERVE_RLIMIT_ON (1 << 1)
#define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2)
+#ifdef __LP64__
+#define atomic_fetchadd_uintptr atomic_fetchadd_long
+#define atomic_subtract_uintptr atomic_subtract_long
+#define atomic_add_uintptr atomic_add_long
+#else
+#define atomic_fetchadd_uintptr atomic_fetchadd_int
+#define atomic_subtract_uintptr atomic_subtract_int
+#define atomic_add_uintptr atomic_add_int
+#endif
+
+struct pcpu_quota *swap_reserve_pq;
int
-swap_reserve(vm_ooffset_t incr)
+swap_reserve(vm_offset_t incr)
{
return (swap_reserve_by_cred(incr, curthread->td_ucred));
}
+static int
+swap_alloc_can_cache(void)
+{
+ vm_offset_t s;
+
+ if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0)
+ return (1);
+ if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) {
+ s = vm_cnt.v_page_count - vm_cnt.v_free_reserved -
+ vm_wire_count();
+ s *= PAGE_SIZE;
+ } else
+ s = 0;
+ s += swap_total;
+ if (__predict_true(2*swap_max_slop < swap_total - swap_reserved))
+ return (1);
+
+ return (0);
+}
+
+static int
+swap_alloc_slow(void *arg __unused, uintptr_t incr, uintptr_t *slop)
+{
+ vm_offset_t r, s, new, adj;
+ int res, can_cache;
+
+ can_cache = swap_alloc_can_cache();
+ adj = (swap_max_pcpu_slop>>1);
+ MPASS(*slop == 0);
+ if (can_cache) {
+ incr += adj;
+ } else if (__predict_false(swap_max_slop > swap_total - swap_reserved))
+ pcpu_quota_cache_set(swap_reserve_pq, 0);
+
+ res = 0;
+ new = atomic_fetchadd_uintptr(&swap_reserved, incr);
+ r = new + incr;
+ if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) {
+ s = vm_cnt.v_page_count - vm_cnt.v_free_reserved -
+ vm_wire_count();
+ s *= PAGE_SIZE;
+ } else
+ s = 0;
+ s += swap_total;
+ if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s ||
+ priv_check(curthread, PRIV_VM_SWAP_NOQUOTA) == 0) {
+ res = 1;
+ *slop = can_cache*adj;
+ } else
+ atomic_subtract_uintptr(&swap_reserved, incr);
+
+ return (res);
+}
+
+static void
+swap_alloc_init(void *arg __unused)
+{
+ uint64_t slop_pages_pcpu;
+
+ slop_pages_pcpu = physmem / (8*mp_ncpus);
+ swap_max_pcpu_slop = slop_pages_pcpu*PAGE_SIZE;
+ swap_max_slop = swap_max_pcpu_slop*mp_ncpus;
+ vmsize_max_pcpu_slop = (slop_pages_pcpu >> 1)*PAGE_SIZE;
+ vmsize_max_slop = vmsize_max_pcpu_slop*mp_ncpus;
+ swap_reserve_pq = pcpu_quota_alloc(&swap_reserved, swap_max_pcpu_slop,
+ swap_alloc_slow, NULL, M_WAITOK);
+}
+SYSINIT(swap_alloc_init, SI_SUB_VM_CONF, SI_ORDER_ANY, swap_alloc_init, NULL);
+
+static int
+swap_alloc(vm_offset_t incr)
+{
+ return (pcpu_quota_incr(swap_reserve_pq, incr));
+}
+
+static void
+swap_free(vm_offset_t decr)
+{
+ return (pcpu_quota_decr(swap_reserve_pq, decr));
+}
+
int
-swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred)
+swap_pager_vmsize_alloc(void *arg, uintptr_t incr, uintptr_t *slop)
{
- vm_ooffset_t r, s;
- int res, error;
+ struct uidinfo *uip;
+ int can_cache;
+ vm_offset_t new, adj;
+
+ uip = arg;
+ MPASS(*slop == 0);
+ adj = (vmsize_max_pcpu_slop >> 1);
+ if ((overcommit & SWAP_RESERVE_RLIMIT_ON) == 0) {
+ *slop = adj;
+ incr += adj;
+ atomic_add_uintptr(&uip->ui_vmsize, incr);
+ return (1);
+ }
+
+ if (__predict_false((overcommit & SWAP_RESERVE_RLIMIT_ON) &&
+ uip->ui_vmsize + swap_max_slop > lim_cur(curthread, RLIMIT_SWAP)))
+ pcpu_quota_cache_set(uip->ui_vmsize_pq, 0);
+ if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 &&
+ uip->ui_vmsize + incr > lim_cur(curthread, RLIMIT_SWAP) &&
+ priv_check(curthread, PRIV_VM_SWAP_NORLIMIT))
+ return (0);
+ can_cache = 0;
+ if ((overcommit & SWAP_RESERVE_RLIMIT_ON) == 0 ||
+ uip->ui_vmsize + 2*swap_max_slop < lim_cur(curthread, RLIMIT_SWAP))
+ can_cache = 1;
+
+ incr += can_cache*adj;
+ new = atomic_fetchadd_uintptr(&uip->ui_vmsize, incr);
+ if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 &&
+ new + incr > lim_cur(curthread, RLIMIT_SWAP)) {
+ atomic_subtract_uintptr(&uip->ui_vmsize, incr);
+ return (0);
+ }
+ *slop = can_cache*adj;
+ return (1);
+}
+
+int
+swap_reserve_by_cred(vm_offset_t incr, struct ucred *cred)
+{
+ int res;
static int curfail;
static struct timeval lastfail;
struct uidinfo *uip;
@@ -197,52 +343,26 @@
#ifdef RACCT
if (racct_enable) {
PROC_LOCK(curproc);
- error = racct_add(curproc, RACCT_SWAP, incr);
+ res = racct_add(curproc, RACCT_SWAP, incr);
PROC_UNLOCK(curproc);
- if (error != 0)
+ if (res != 0)
return (0);
}
#endif
- res = 0;
- mtx_lock(&sw_dev_mtx);
- r = swap_reserved + incr;
- if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) {
- s = vm_cnt.v_page_count - vm_cnt.v_free_reserved -
- vm_wire_count();
- s *= PAGE_SIZE;
- } else
- s = 0;
- s += swap_total;
- if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s ||
- (error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) {
- res = 1;
- swap_reserved = r;
- }
- mtx_unlock(&sw_dev_mtx);
-
+ res = swap_alloc(incr);
if (res) {
- UIDINFO_VMSIZE_LOCK(uip);
- if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 &&
- uip->ui_vmsize + incr > lim_cur(curthread, RLIMIT_SWAP) &&
- priv_check(curthread, PRIV_VM_SWAP_NORLIMIT))
- res = 0;
- else
- uip->ui_vmsize += incr;
- UIDINFO_VMSIZE_UNLOCK(uip);
- if (!res) {
- mtx_lock(&sw_dev_mtx);
- swap_reserved -= incr;
- mtx_unlock(&sw_dev_mtx);
- }
+ res = pcpu_quota_incr(uip->ui_vmsize_pq, incr);
+ if (!res)
+ swap_free(incr);
}
if (!res && ppsratecheck(&lastfail, &curfail, 1)) {
printf("uid %d, pid %d: swap reservation for %jd bytes failed\n",
- uip->ui_uid, curproc->p_pid, incr);
+ uip->ui_uid, curproc->p_pid, (intmax_t)incr);
}
#ifdef RACCT
- if (!res) {
+ if (racct_enable && !res) {
PROC_LOCK(curproc);
racct_sub(curproc, RACCT_SWAP, incr);
PROC_UNLOCK(curproc);
@@ -253,41 +373,36 @@
}
void
-swap_reserve_force(vm_ooffset_t incr)
+swap_reserve_force(vm_offset_t incr)
{
struct uidinfo *uip;
- mtx_lock(&sw_dev_mtx);
- swap_reserved += incr;
- mtx_unlock(&sw_dev_mtx);
+ if (swap_alloc(incr) == 0)
+ atomic_add_uintptr(&swap_reserved, incr);
#ifdef RACCT
- PROC_LOCK(curproc);
- racct_add_force(curproc, RACCT_SWAP, incr);
- PROC_UNLOCK(curproc);
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_force(curproc, RACCT_SWAP, incr);
+ PROC_UNLOCK(curproc);
+ }
#endif
uip = curthread->td_ucred->cr_ruidinfo;
- PROC_LOCK(curproc);
- UIDINFO_VMSIZE_LOCK(uip);
- uip->ui_vmsize += incr;
- UIDINFO_VMSIZE_UNLOCK(uip);
- PROC_UNLOCK(curproc);
+ atomic_add_uintptr(&uip->ui_vmsize, incr);
}
void
-swap_release(vm_ooffset_t decr)
+swap_release(vm_offset_t decr)
{
struct ucred *cred;
- PROC_LOCK(curproc);
cred = curthread->td_ucred;
swap_release_by_cred(decr, cred);
- PROC_UNLOCK(curproc);
}
void
-swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred)
+swap_release_by_cred(vm_offset_t decr, struct ucred *cred)
{
struct uidinfo *uip;
@@ -296,19 +411,15 @@
if (decr & PAGE_MASK)
panic("swap_release: & PAGE_MASK");
- mtx_lock(&sw_dev_mtx);
if (swap_reserved < decr)
panic("swap_reserved < decr");
- swap_reserved -= decr;
- mtx_unlock(&sw_dev_mtx);
+ swap_free(decr);
- UIDINFO_VMSIZE_LOCK(uip);
if (uip->ui_vmsize < decr)
printf("negative vmsize for uid = %d\n", uip->ui_uid);
- uip->ui_vmsize -= decr;
- UIDINFO_VMSIZE_UNLOCK(uip);
-
- racct_sub_cred(cred, RACCT_SWAP, decr);
+ pcpu_quota_decr(uip->ui_vmsize_pq, decr);
+ if (racct_enable)
+ racct_sub_cred(cred, RACCT_SWAP, decr);
}
#define SWM_POP 0x01 /* pop out */
Index: sys/vm/uma.h
===================================================================
--- sys/vm/uma.h
+++ sys/vm/uma.h
@@ -44,6 +44,8 @@
/* User visible parameters */
#define UMA_SMALLEST_UNIT (PAGE_SIZE / 256) /* Smallest item allocated */
+#define UMA_PCPU_ZONE_SIZE PAGE_SIZE
+
/* Types and type defs */
struct uma_zone;
@@ -279,8 +281,7 @@
* mini-dumps.
*/
#define UMA_ZONE_PCPU 0x8000 /*
- * Allocates mp_maxid + 1 slabs sized to
- * sizeof(struct pcpu).
+ * Allocates mp_maxid + 1 slabs of PAGE_SIZE bytes each.
*/
#define UMA_ZONE_NUMA 0x10000 /*
* NUMA aware Zone. Implements a best
Index: sys/vm/uma_core.c
===================================================================
--- sys/vm/uma_core.c
+++ sys/vm/uma_core.c
@@ -229,8 +229,10 @@
static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
+static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void page_free(void *, vm_size_t, uint8_t);
+static void pcpu_page_free(void *, vm_size_t, uint8_t);
static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int);
static void cache_drain(uma_zone_t);
static void bucket_drain(uma_zone_t, uma_bucket_t);
@@ -1172,6 +1174,54 @@
return (p);
}
+static void *
+pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
+ int wait)
+{
+ TAILQ_HEAD(, vm_page) alloctail;
+ vm_offset_t addr, zkva;
+ struct pcpu *pc;
+ int cpu;
+ vm_page_t p, p_next;
+
+ TAILQ_INIT(&alloctail);
+ MPASS(bytes == (mp_maxid+1)*PAGE_SIZE);
+ *pflag = UMA_SLAB_KERNEL;
+
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ if (CPU_ABSENT(cpu)) {
+ p = vm_page_alloc_domain(NULL, 0, 0, VM_ALLOC_INTERRUPT |
+ VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
+ ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
+ VM_ALLOC_NOWAIT));
+
+ } else {
+ pc = pcpu_find(cpu);
+ p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, VM_ALLOC_INTERRUPT |
+ VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
+ ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
+ VM_ALLOC_NOWAIT));
+ }
+ if (__predict_false(p == NULL))
+ goto fail;
+ TAILQ_INSERT_TAIL(&alloctail, p, listq);
+ }
+ if ((addr = kva_alloc(bytes)) == 0)
+ goto fail;
+ zkva = addr;
+ TAILQ_FOREACH(p, &alloctail, listq) {
+ pmap_qenter(zkva, &p, 1);
+ zkva += PAGE_SIZE;
+ }
+ return ((void*)addr);
+ fail:
+ TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
+ vm_page_unwire(p, PQ_NONE);
+ vm_page_free(p);
+ }
+ return (NULL);
+}
+
/*
* Allocates a number of pages from within an object
*
@@ -1257,6 +1307,37 @@
kmem_free(vmem, (vm_offset_t)mem, size);
}
+/*
+ * Frees pcpu zone allocations
+ *
+ * Arguments:
+ * mem A pointer to the memory to be freed
+ * size The size of the memory being freed
+ * flags The original p->us_flags field
+ *
+ * Returns:
+ * Nothing
+ */
+static void
+pcpu_page_free(void *mem, vm_size_t size, uint8_t flags)
+{
+ vm_offset_t sva, curva;
+ vm_paddr_t paddr;
+ vm_page_t m;
+
+ MPASS(size == (mp_maxid+1)*PAGE_SIZE);
+ sva = (vm_offset_t)mem;
+ for (curva = sva; curva < sva + size; curva += PAGE_SIZE) {
+ paddr = pmap_kextract(curva);
+ m = PHYS_TO_VM_PAGE(paddr);
+ vm_page_unwire(m, PQ_NONE);
+ vm_page_free(m);
+ }
+ pmap_qremove(sva, size >> PAGE_SHIFT);
+ kva_free(sva, size);
+}
+
+
/*
* Zero fill initializer
*
@@ -1290,9 +1371,8 @@
if (keg->uk_flags & UMA_ZONE_PCPU) {
u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
- slabsize = sizeof(struct pcpu);
- keg->uk_ppera = howmany(ncpus * sizeof(struct pcpu),
- PAGE_SIZE);
+ slabsize = PAGE_SIZE;
+ keg->uk_ppera = ncpus;
} else {
slabsize = UMA_SLAB_SIZE;
keg->uk_ppera = 1;
@@ -1311,7 +1391,7 @@
keg->uk_rsize = rsize;
KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
- keg->uk_rsize < sizeof(struct pcpu),
+ keg->uk_rsize < UMA_PCPU_ZONE_SIZE,
("%s: size %u too large", __func__, keg->uk_rsize));
if (keg->uk_flags & UMA_ZONE_OFFPAGE)
@@ -1529,6 +1609,8 @@
else if (keg->uk_ppera == 1)
keg->uk_allocf = uma_small_alloc;
#endif
+ else if (keg->uk_flags & UMA_ZONE_PCPU)
+ keg->uk_allocf = pcpu_page_alloc;
else
keg->uk_allocf = page_alloc;
#ifdef UMA_MD_SMALL_ALLOC
@@ -1536,6 +1618,9 @@
keg->uk_freef = uma_small_free;
else
#endif
+ if (keg->uk_flags & UMA_ZONE_PCPU)
+ keg->uk_freef = pcpu_page_free;
+ else
keg->uk_freef = page_free;
/*
Index: sys/vm/uma_int.h
===================================================================
--- sys/vm/uma_int.h
+++ sys/vm/uma_int.h
@@ -222,9 +222,8 @@
*
*/
struct uma_keg {
- struct mtx_padalign uk_lock; /* Lock for the keg */
+ struct mtx uk_lock; /* Lock for the keg */
struct uma_hash uk_hash;
-
LIST_HEAD(,uma_zone) uk_zones; /* Keg's zones */
uint32_t uk_cursor; /* Domain alloc cursor. */
@@ -315,41 +314,49 @@
*
*/
struct uma_zone {
- struct mtx_padalign uz_lock; /* Lock for the zone */
- struct mtx_padalign *uz_lockptr;
- const char *uz_name; /* Text name of the zone */
-
- LIST_ENTRY(uma_zone) uz_link; /* List of all zones in keg */
+ /* Offset 0, used in alloc/free fast/medium fast path and const. */
+ struct mtx *uz_lockptr;
+ const char *uz_name; /* Text name of the zone */
struct uma_zone_domain *uz_domain; /* per-domain buckets */
-
- LIST_HEAD(,uma_klink) uz_kegs; /* List of kegs. */
- struct uma_klink uz_klink; /* klink for first keg. */
-
- uma_slaballoc uz_slab; /* Allocate a slab from the backend. */
+ uint32_t uz_flags; /* Flags inherited from kegs */
+ uint32_t uz_size; /* Size inherited from kegs */
uma_ctor uz_ctor; /* Constructor for each allocation */
uma_dtor uz_dtor; /* Destructor */
uma_init uz_init; /* Initializer for each item */
uma_fini uz_fini; /* Finalizer for each item. */
+
+ /* Offset 64, used in bucket replenish. */
uma_import uz_import; /* Import new memory to cache. */
uma_release uz_release; /* Release memory from cache. */
void *uz_arg; /* Import/release argument. */
-
- uint32_t uz_flags; /* Flags inherited from kegs */
- uint32_t uz_size; /* Size inherited from kegs */
-
- volatile u_long uz_allocs UMA_ALIGN; /* Total number of allocations */
- volatile u_long uz_fails; /* Total number of alloc failures */
- volatile u_long uz_frees; /* Total number of frees */
- uint64_t uz_sleeps; /* Total number of alloc sleeps */
+ uma_slaballoc uz_slab; /* Allocate a slab from the backend. */
uint16_t uz_count; /* Amount of items in full bucket */
uint16_t uz_count_min; /* Minimal amount of items there */
+ /* 32bit pad on 64bit. */
+ LIST_ENTRY(uma_zone) uz_link; /* List of all zones in keg */
+ LIST_HEAD(,uma_klink) uz_kegs; /* List of kegs. */
+ /* Offset 128 Rare. */
+ /*
+ * The lock is placed here to keep it away from the adjacent
+ * cache-line prefetcher in fast paths and to take up space near
+ * infrequently accessed members, reducing alignment overhead.
+ */
+ struct mtx uz_lock; /* Lock for the zone */
+ struct uma_klink uz_klink; /* klink for first keg. */
/* The next two fields are used to print a rate-limited warnings. */
const char *uz_warning; /* Warning to print on failure */
struct timeval uz_ratecheck; /* Warnings rate-limiting */
-
struct task uz_maxaction; /* Task to run when at limit */
+ /* 16 bytes of pad. */
+
+ /* Offset 256, atomic stats. */
+ volatile u_long uz_allocs UMA_ALIGN; /* Total number of allocations */
+ volatile u_long uz_fails; /* Total number of alloc failures */
+ volatile u_long uz_frees; /* Total number of frees */
+ uint64_t uz_sleeps; /* Total number of alloc sleeps */
+
/*
* This HAS to be the last item because we adjust the zone size
* based on NCPU and then allocate the space for the zones.
Index: sys/vm/vm.h
===================================================================
--- sys/vm/vm.h
+++ sys/vm/vm.h
@@ -151,11 +151,11 @@
extern int vm_ndomains;
struct ucred;
-int swap_reserve(vm_ooffset_t incr);
-int swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred);
-void swap_reserve_force(vm_ooffset_t incr);
-void swap_release(vm_ooffset_t decr);
-void swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred);
+int swap_reserve(vm_offset_t incr);
+int swap_reserve_by_cred(vm_offset_t incr, struct ucred *cred);
+void swap_reserve_force(vm_offset_t incr);
+void swap_release(vm_offset_t decr);
+void swap_release_by_cred(vm_offset_t decr, struct ucred *cred);
void swapper(void);
#endif /* VM_H */
Index: sys/x86/acpica/srat.c
===================================================================
--- sys/x86/acpica/srat.c
+++ sys/x86/acpica/srat.c
@@ -517,12 +517,15 @@
static void
srat_set_cpus(void *dummy)
{
+#ifdef NUMA
struct cpu_info *cpu;
struct pcpu *pc;
u_int i;
+#endif
if (srat_physaddr == 0)
return;
+#ifdef NUMA
for (i = 0; i < MAXCPU; i++) {
if (CPU_ABSENT(i))
continue;
@@ -538,7 +541,7 @@
printf("SRAT: CPU %u has memory domain %d\n", i,
cpu->domain);
}
-
+#endif
/* Last usage of the cpus array, unmap it. */
pmap_unmapbios((vm_offset_t)cpus, sizeof(*cpus) * (max_apic_id + 1));
cpus = NULL;
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Mar 12, 2:16 AM (14 h, 59 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
29560670
Default Alt Text
D15975.diff (50 KB)
Attached To
Mode
D15975: eliminate global serialization points in swap reserve & mmap
Attached
Detach File
Event Timeline
Log In to Comment