Page MenuHomeFreeBSD

D5080.id12953.diff
No OneTemporary

D5080.id12953.diff

Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -132,6 +132,7 @@
#include <sys/multilist.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
+#include <sys/racct.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
@@ -4503,6 +4504,18 @@
demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
data, metadata, misses);
#ifdef _KERNEL
+#ifdef RACCT
+ if (racct_enable) {
+#if 0
+ printf("%s: adding %jd bytes for %d (%s)\n",
+ __func__, (uintmax_t)size, curproc->p_pid, curproc->p_comm);
+#endif
+ PROC_LOCK(curproc);
+ racct_add_force(curproc, RACCT_READBPS, size);
+ racct_add_force(curproc, RACCT_READIOPS, 1);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
#endif
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -47,6 +47,7 @@
#include <sys/sa.h>
#include <sys/zfeature.h>
#ifdef _KERNEL
+#include <sys/racct.h>
#include <sys/vm.h>
#include <sys/zfs_znode.h>
#endif
@@ -427,6 +428,15 @@
}
dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+#if defined(_KERNEL) && defined(RACCT)
+ if (racct_enable && !read) {
+ PROC_LOCK(curproc);
+ racct_add_force(curproc, RACCT_WRITEBPS, length);
+ racct_add_force(curproc, RACCT_WRITEIOPS, nblks);
+ PROC_UNLOCK(curproc);
+ }
+#endif
+
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, 0, offset);
for (i = 0; i < nblks; i++) {
@@ -1422,7 +1432,15 @@
DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) {
#ifdef _KERNEL
curthread->td_ru.ru_oublock++;
-#endif
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_force(curproc, RACCT_WRITEBPS, blksz);
+ racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
+#endif /* _KERNEL */
dbuf_assign_arcbuf(db, buf, tx);
dbuf_rele(db, FTAG);
} else {
Index: sys/fs/ext2fs/ext2_bmap.c
===================================================================
--- sys/fs/ext2fs/ext2_bmap.c
+++ sys/fs/ext2fs/ext2_bmap.c
@@ -42,6 +42,7 @@
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
@@ -243,6 +244,13 @@
vfs_busy_pages(bp, 0);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif
curthread->td_ru.ru_inblock++;
error = bufwait(bp);
if (error) {
Index: sys/kern/kern_physio.c
===================================================================
--- sys/kern/kern_physio.c
+++ sys/kern/kern_physio.c
@@ -27,6 +27,7 @@
#include <sys/conf.h>
#include <sys/malloc.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/uio.h>
#include <geom/geom.h>
@@ -109,6 +110,22 @@
prot |= VM_PROT_WRITE; /* Less backwards than it looks */
error = 0;
for (i = 0; i < uio->uio_iovcnt; i++) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ if (uio->uio_rw == UIO_READ) {
+ racct_add_force(curproc, RACCT_READBPS,
+ uio->uio_iov[i].iov_len);
+ racct_add_force(curproc, RACCT_READIOPS, 1);
+ } else {
+ racct_add_force(curproc, RACCT_WRITEBPS,
+ uio->uio_iov[i].iov_len);
+ racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+ }
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
+
while (uio->uio_iov[i].iov_len) {
bzero(bp, sizeof(*bp));
if (uio->uio_rw == UIO_READ) {
Index: sys/kern/kern_racct.c
===================================================================
--- sys/kern/kern_racct.c
+++ sys/kern/kern_racct.c
@@ -35,6 +35,7 @@
#include "opt_sched.h"
#include <sys/param.h>
+#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/jail.h>
@@ -66,21 +67,25 @@
FEATURE(racct, "Resource Accounting");
-/*
- * Do not block processes that have their %cpu usage <= pcpu_threshold.
- */
-static int pcpu_threshold = 1;
#ifdef RACCT_DEFAULT_TO_DISABLED
int racct_enable = 0;
#else
int racct_enable = 1;
#endif
+/*
+ * Do not block processes that have their %cpu usage <= pcpu_threshold.
+ */
+static int pcpu_threshold = 1;
+int racct_pctcpu_throttle_not_deny = 0;
SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting");
SYSCTL_UINT(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable,
0, "Enable RACCT/RCTL");
SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold,
- 0, "Processes with higher %cpu usage than this value can be throttled.");
+ 0, "\"pcpu:deny\" rules don't affect processes with %cpu below this value");
+SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_throttle_not_deny, CTLFLAG_RD,
+ &racct_pctcpu_throttle_not_deny,
+ 0, "Use \"pcpu:throttle\" rules instead of \"pcpu:deny\"");
/*
* How many seconds it takes to use the scheduler %cpu calculations. When a
@@ -90,7 +95,7 @@
*/
#define RACCT_PCPU_SECS 3
-static struct mtx racct_lock;
+struct mtx racct_lock;
MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF);
static uma_zone_t racct_zone;
@@ -171,7 +176,16 @@
[RACCT_WALLCLOCK] =
RACCT_IN_MILLIONS,
[RACCT_PCTCPU] =
- RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS };
+ RACCT_RECLAIMABLE | RACCT_DECAYING | RACCT_DENIABLE |
+ RACCT_IN_MILLIONS,
+ [RACCT_READBPS] =
+ RACCT_DECAYING,
+ [RACCT_WRITEBPS] =
+ RACCT_DECAYING,
+ [RACCT_READIOPS] =
+ RACCT_DECAYING,
+ [RACCT_WRITEIOPS] =
+ RACCT_DECAYING };
static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
@@ -473,6 +487,8 @@
for (i = 0; i <= RACCT_MAX; i++) {
if (RACCT_IS_SLOPPY(i))
continue;
+ if (RACCT_IS_DECAYING(i))
+ continue;
if (!RACCT_IS_RECLAIMABLE(i))
continue;
KASSERT(racct->r_resources[i] == 0,
@@ -498,8 +514,8 @@
/*
* Increase consumption of 'resource' by 'amount' for 'racct',
- * but not its parents. Differently from other cases, 'amount' here
- * may be less than zero.
+ * but not its parents. Differently from other cases, 'amount'
+ * here may be less than zero.
*/
static void
racct_adjust_resource(struct racct *racct, int resource,
@@ -526,9 +542,10 @@
* returns for a thread more than 100% cpu usage. So we set a sane
* boundary here to 100% * the maxumum number of CPUs.
*/
- if ((resource == RACCT_PCTCPU) &&
- (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU))
+ if (!racct_pctcpu_throttle_not_deny && resource == RACCT_PCTCPU &&
+ racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU) {
racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU;
+ }
}
static int
@@ -612,16 +629,11 @@
mtx_unlock(&racct_lock);
}
-/*
- * Increase allocation of 'resource' by 'amount' for process 'p'.
- * Doesn't check for limits and never fails.
- */
-void
-racct_add_force(struct proc *p, int resource, uint64_t amount)
+static void
+racct_add_force_locked(struct proc *p, int resource, uint64_t amount)
{
- if (!racct_enable)
- return;
+ ASSERT_RACCT_ENABLED();
SDT_PROBE3(racct, , rusage, add__force, p, resource, amount);
@@ -630,12 +642,48 @@
*/
PROC_LOCK_ASSERT(p, MA_OWNED);
- mtx_lock(&racct_lock);
+ /*
+ * Ignore the return value - we can't return error, but RCTL might
+ * eg. throttle the process.
+ */
+ (void)rctl_enforce(p, resource, amount);
+
racct_adjust_resource(p->p_racct, resource, amount);
racct_add_cred_locked(p->p_ucred, resource, amount);
+}
+
+/*
+ * Increase allocation of 'resource' by 'amount' for process 'p'.
+ * Checks for limits, but never fails.
+ */
+void
+racct_add_force(struct proc *p, int resource, uint64_t amount)
+{
+
+ if (!racct_enable)
+ return;
+
+ mtx_lock(&racct_lock);
+ racct_add_force_locked(p, resource, amount);
mtx_unlock(&racct_lock);
}
+/* Charge one I/O of bp->b_bcount bytes to 'p', whose lock must be held. */
+void
+racct_add_buf(struct proc *p, const struct buf *bp, int is_write)
+{
+	int bps, iops;
+
+	ASSERT_RACCT_ENABLED();
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	bps = is_write ? RACCT_WRITEBPS : RACCT_READBPS;
+	iops = is_write ? RACCT_WRITEIOPS : RACCT_READIOPS;
+	/* Account to 'p', the process whose lock was asserted above. */
+	racct_add_force(p, bps, bp->b_bcount);
+	racct_add_force(p, iops, 1);
+}
+
static int
racct_set_locked(struct proc *p, int resource, uint64_t amount)
{
@@ -659,7 +707,7 @@
* The diffs may be negative.
*/
diff_proc = amount - old_amount;
- if (RACCT_IS_DECAYING(resource)) {
+ if (!racct_pctcpu_throttle_not_deny && RACCT_IS_DECAYING(resource)) {
/*
* Resources in per-credential racct containers may decay.
* If this is the case, we need to calculate the difference
@@ -688,7 +736,7 @@
racct_adjust_resource(p->p_racct, resource, diff_proc);
if (diff_cred > 0)
racct_add_cred_locked(p->p_ucred, resource, diff_cred);
- else if (diff_cred < 0)
+ else if (RACCT_IS_RECLAIMABLE(resource) && diff_cred < 0)
racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
return (0);
@@ -735,7 +783,7 @@
* The diffs may be negative.
*/
diff_proc = amount - old_amount;
- if (RACCT_IS_DECAYING(resource)) {
+ if (!racct_pctcpu_throttle_not_deny && RACCT_IS_DECAYING(resource)) {
/*
* Resources in per-credential racct containers may decay.
* If this is the case, we need to calculate the difference
@@ -750,7 +798,7 @@
racct_adjust_resource(p->p_racct, resource, diff_proc);
if (diff_cred > 0)
racct_add_cred_locked(p->p_ucred, resource, diff_cred);
- else if (diff_cred < 0)
+ else if (RACCT_IS_RECLAIMABLE(resource) && diff_cred < 0)
racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
}
@@ -850,7 +898,8 @@
(intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid));
racct_adjust_resource(p->p_racct, resource, -amount);
- racct_sub_cred_locked(p->p_ucred, resource, amount);
+ if (RACCT_IS_RECLAIMABLE(resource))
+ racct_sub_cred_locked(p->p_ucred, resource, amount);
mtx_unlock(&racct_lock);
}
@@ -994,13 +1043,17 @@
#endif
microuptime(&wallclock);
timevalsub(&wallclock, &p->p_stats->p_start);
- if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
- pct_estimate = (1000000 * runtime * 100) /
- ((uint64_t)wallclock.tv_sec * 1000000 +
- wallclock.tv_usec);
- } else
- pct_estimate = 0;
- pct = racct_getpcpu(p, pct_estimate);
+ if (!racct_pctcpu_throttle_not_deny) {
+ if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
+ pct_estimate = (1000000 * runtime * 100) /
+ ((uint64_t)wallclock.tv_sec * 1000000 +
+ wallclock.tv_usec);
+ } else
+ pct_estimate = 0;
+ pct = racct_getpcpu(p, pct_estimate);
+ } else {
+ pct = (runtime - p->p_prev_runtime) * 100;
+ }
mtx_lock(&racct_lock);
racct_set_locked(p, RACCT_CPU, runtime);
@@ -1085,14 +1138,15 @@
mtx_unlock(&racct_lock);
}
-static void
-racct_proc_throttle(struct proc *p)
+void
+racct_proc_throttle(struct proc *p, int timeout)
{
struct thread *td;
#ifdef SMP
int cpuid;
#endif
+ KASSERT(timeout != 0, ("timeout %d", timeout));
ASSERT_RACCT_ENABLED();
PROC_LOCK_ASSERT(p, MA_OWNED);
@@ -1100,10 +1154,13 @@
* Do not block kernel processes. Also do not block processes with
* low %cpu utilization to improve interactivity.
*/
- if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) ||
- (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold))
+ if ((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0)
+ return;
+
+ if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout))
return;
- p->p_throttled = 1;
+
+ p->p_throttled = timeout;
FOREACH_THREAD_IN_PROC(p, td) {
thread_lock(td);
@@ -1144,30 +1201,37 @@
PROC_LOCK_ASSERT(p, MA_OWNED);
- if (p->p_throttled) {
+ if (p->p_throttled != 0) {
p->p_throttled = 0;
wakeup(p->p_racct);
}
}
static void
-racct_decay_resource(struct racct *racct, void * res, void* dummy)
+racct_decay_callback(struct racct *racct, void *res, void *dummy)
{
- int resource;
int64_t r_old, r_new;
ASSERT_RACCT_ENABLED();
mtx_assert(&racct_lock, MA_OWNED);
- resource = *(int *)res;
- r_old = racct->r_resources[resource];
+ rctl_throttle_decay(racct, RACCT_READBPS);
+ rctl_throttle_decay(racct, RACCT_WRITEBPS);
+ rctl_throttle_decay(racct, RACCT_READIOPS);
+ rctl_throttle_decay(racct, RACCT_WRITEIOPS);
+
+ if (racct_pctcpu_throttle_not_deny) {
+ rctl_throttle_decay(racct, RACCT_PCTCPU);
+ } else {
+ r_old = racct->r_resources[RACCT_PCTCPU];
+
+ /* If there is nothing to decay, just exit. */
+ if (r_old <= 0)
+ return;
- /* If there is nothing to decay, just exit. */
- if (r_old <= 0)
- return;
-
- r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
- racct->r_resources[resource] = r_new;
+ r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
+ racct->r_resources[RACCT_PCTCPU] = r_new;
+ }
}
static void
@@ -1185,17 +1249,17 @@
}
static void
-racct_decay(int resource)
+racct_decay(void)
{
ASSERT_RACCT_ENABLED();
- ui_racct_foreach(racct_decay_resource, racct_decay_pre,
- racct_decay_post, &resource, NULL);
- loginclass_racct_foreach(racct_decay_resource, racct_decay_pre,
- racct_decay_post, &resource, NULL);
- prison_racct_foreach(racct_decay_resource, racct_decay_pre,
- racct_decay_post, &resource, NULL);
+ ui_racct_foreach(racct_decay_callback, racct_decay_pre,
+ racct_decay_post, NULL, NULL);
+ loginclass_racct_foreach(racct_decay_callback, racct_decay_pre,
+ racct_decay_post, NULL, NULL);
+ prison_racct_foreach(racct_decay_callback, racct_decay_pre,
+ racct_decay_post, NULL, NULL);
}
static void
@@ -1206,18 +1270,22 @@
struct timeval wallclock;
uint64_t runtime;
uint64_t pct, pct_estimate;
+ bool throttle;
+ //int error;
ASSERT_RACCT_ENABLED();
for (;;) {
- racct_decay(RACCT_PCTCPU);
+ racct_decay();
sx_slock(&allproc_lock);
- LIST_FOREACH(p, &zombproc, p_list) {
- PROC_LOCK(p);
- racct_set(p, RACCT_PCTCPU, 0);
- PROC_UNLOCK(p);
+ if (!racct_pctcpu_throttle_not_deny) {
+ LIST_FOREACH(p, &zombproc, p_list) {
+ PROC_LOCK(p);
+ racct_set(p, RACCT_PCTCPU, 0);
+ PROC_UNLOCK(p);
+ }
}
FOREACH_PROC_IN_SYSTEM(p) {
@@ -1241,24 +1309,63 @@
if (runtime < p->p_prev_runtime)
runtime = p->p_prev_runtime;
#endif
- p->p_prev_runtime = runtime;
- if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
- pct_estimate = (1000000 * runtime * 100) /
- ((uint64_t)wallclock.tv_sec * 1000000 +
- wallclock.tv_usec);
- } else
- pct_estimate = 0;
- pct = racct_getpcpu(p, pct_estimate);
+ if (!racct_pctcpu_throttle_not_deny) {
+ if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
+ pct_estimate = (1000000 * runtime * 100) /
+ ((uint64_t)wallclock.tv_sec * 1000000 +
+ wallclock.tv_usec);
+ } else
+ pct_estimate = 0;
+ pct = racct_getpcpu(p, pct_estimate);
+ } else {
+ pct = (runtime - p->p_prev_runtime) * 100;
+ }
mtx_lock(&racct_lock);
- racct_set_force_locked(p, RACCT_PCTCPU, pct);
+
+ rctl_throttle_decay(p->p_racct, RACCT_READBPS);
+ rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS);
+ rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
+ rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
+ if (racct_pctcpu_throttle_not_deny) {
+ rctl_throttle_decay(p->p_racct, RACCT_PCTCPU);
+ /*
+ * Preferably we would bump the counters
+ * from statclock. We cannot, however - to call
+ * racct_add() we need the proc lock, and we
+ * cannot acquire that in statclock().
+ *
+ * Note that 1% basically means 10 milliseconds.
+ */
+ racct_add_force_locked(p, RACCT_PCTCPU, pct);
+ } else {
+ racct_set_force_locked(p, RACCT_PCTCPU, pct);
+ }
+ p->p_prev_runtime = runtime;
+
+#if 0
+ if (racct_pctcpu_throttle_not_deny) {
+ /*
+ * Ignore the return value; there cannot
+ * be a "deny" rule there.
+ */
+ error = rctl_enforce(p, RACCT_PCTCPU, 0);
+ KASSERT(error == 0,
+ ("rctl_enforce() returned %d", error));
+ }
+#endif
+
racct_set_locked(p, RACCT_CPU, runtime);
racct_set_locked(p, RACCT_WALLCLOCK,
(uint64_t)wallclock.tv_sec * 1000000 +
wallclock.tv_usec);
+
mtx_unlock(&racct_lock);
PROC_UNLOCK(p);
}
+ if (racct_pctcpu_throttle_not_deny)
+ goto done;
+
/*
* To ensure that processes are throttled in a fair way, we need
* to iterate over all processes again and check the limits
@@ -1272,12 +1379,18 @@
continue;
}
- if (racct_pcpu_available(p) <= 0)
- racct_proc_throttle(p);
- else if (p->p_throttled)
+ throttle = false;
+ if (racct_pcpu_available(p) <= 0 &&
+ (p->p_racct->r_resources[RACCT_PCTCPU] > pcpu_threshold))
+ throttle = true;
+
+ if (throttle)
+ racct_proc_throttle(p, -1);
+ else if (p->p_throttled == -1)
racct_proc_wakeup(p);
PROC_UNLOCK(p);
}
+done:
sx_sunlock(&allproc_lock);
pause("-", hz);
}
Index: sys/kern/kern_rctl.c
===================================================================
--- sys/kern/kern_rctl.c
+++ sys/kern/kern_rctl.c
@@ -77,11 +77,31 @@
#define RCTL_PCPU_SHIFT (10 * 1000000)
-unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
+static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
+static int rctl_log_ratelimit = 10;
+static int rctl_devctl_ratelimit = 10;
+static unsigned int rctl_throttle_min = 0;
+static unsigned int rctl_throttle_max = 0;
+static unsigned int rctl_throttle_pct = 0;
+static unsigned int rctl_throttle_pct2 = 0;
SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
&rctl_maxbufsize, 0, "Maximum output buffer size");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, rctl_log_ratelimit, CTLFLAG_RW,
+ &rctl_log_ratelimit, 0, "Maximum number of log messages per second");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, rctl_devctl_ratelimit, CTLFLAG_RW,
+ &rctl_devctl_ratelimit, 0, "Maximum number of devctl messages per second");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_min, CTLFLAG_RDTUN,
+ &rctl_throttle_min, 0, "Shortest throttling duration, in hz");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_max, CTLFLAG_RDTUN,
+ &rctl_throttle_max, 0, "Longest throttling duration, in hz");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_pct, CTLFLAG_RDTUN,
+ &rctl_throttle_pct, 0,
+ "Throttling penalty for process consumption, in percent");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_pct2, CTLFLAG_RDTUN,
+ &rctl_throttle_pct2, 0,
+ "Throttling penalty for container consumption, in percent");
/*
* 'rctl_rule_link' connects a rule with every racct it's related to.
@@ -128,6 +148,10 @@
{ "shmsize", RACCT_SHMSIZE },
{ "wallclock", RACCT_WALLCLOCK },
{ "pcpu", RACCT_PCTCPU },
+ { "readbps", RACCT_READBPS },
+ { "writebps", RACCT_WRITEBPS },
+ { "readiops", RACCT_READIOPS },
+ { "writeiops", RACCT_WRITEIOPS },
{ NULL, -1 }};
static struct dict actionnames[] = {
@@ -165,6 +189,7 @@
{ "deny", RCTL_ACTION_DENY },
{ "log", RCTL_ACTION_LOG },
{ "devctl", RCTL_ACTION_DEVCTL },
+ { "throttle", RCTL_ACTION_THROTTLE },
{ NULL, -1 }};
static void rctl_init(void);
@@ -219,66 +244,96 @@
panic("rctl_resource_name: unknown resource %d", resource);
}
-/*
- * Return the amount of resource that can be allocated by 'p' before
- * hitting 'rule'.
- */
-static int64_t
-rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
+static struct racct *
+rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
{
- int resource;
- int64_t available = INT64_MAX;
struct ucred *cred = p->p_ucred;
ASSERT_RACCT_ENABLED();
rw_assert(&rctl_lock, RA_LOCKED);
- resource = rule->rr_resource;
switch (rule->rr_per) {
case RCTL_SUBJECT_TYPE_PROCESS:
- available = rule->rr_amount -
- p->p_racct->r_resources[resource];
- break;
+ return (p->p_racct);
case RCTL_SUBJECT_TYPE_USER:
- available = rule->rr_amount -
- cred->cr_ruidinfo->ui_racct->r_resources[resource];
- break;
+ return (cred->cr_ruidinfo->ui_racct);
case RCTL_SUBJECT_TYPE_LOGINCLASS:
- available = rule->rr_amount -
- cred->cr_loginclass->lc_racct->r_resources[resource];
- break;
+ return (cred->cr_loginclass->lc_racct);
case RCTL_SUBJECT_TYPE_JAIL:
- available = rule->rr_amount -
- cred->cr_prison->pr_prison_racct->prr_racct->
- r_resources[resource];
- break;
+ return (cred->cr_prison->pr_prison_racct->prr_racct);
default:
- panic("rctl_compute_available: unknown per %d",
- rule->rr_per);
+ panic("%s: unknown per %d", __func__, rule->rr_per);
}
+}
+
+/*
+ * Return the amount of resource that can be allocated by 'p' before
+ * hitting 'rule'.
+ */
+static int64_t
+rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
+{
+ int64_t available = INT64_MAX;
+ const struct racct *racct;
+
+ ASSERT_RACCT_ENABLED();
+ rw_assert(&rctl_lock, RA_LOCKED);
+ mtx_assert(&racct_lock, MA_OWNED);
+
+ racct = rctl_proc_rule_to_racct(p, rule);
+ available = rule->rr_amount - racct->r_resources[rule->rr_resource];
return (available);
}
/*
- * Return non-zero if allocating 'amount' by proc 'p' would exceed
- * resource limit specified by 'rule'.
+ * Called every second for proc, uidinfo, loginclass, and jail containers.
+ * If the limit wasn't exceeded, it decreases the usage amount to zero.
+ * Otherwise, it decreases it by the value of the limit. This way
+ * resource consumption exceeding the limit "carries over" to the next
+ * period.
*/
-static int
-rctl_would_exceed(const struct proc *p, const struct rctl_rule *rule,
- int64_t amount)
+void
+rctl_throttle_decay(struct racct *racct, int resource)
{
- int64_t available;
+ struct rctl_rule *rule;
+ struct rctl_rule_link *link;
+ int64_t minavailable;
- ASSERT_RACCT_ENABLED();
+ mtx_assert(&racct_lock, MA_OWNED);
- rw_assert(&rctl_lock, RA_LOCKED);
+ minavailable = INT64_MAX;
- available = rctl_available_resource(p, rule);
- if (available >= amount)
- return (0);
+ rw_rlock(&rctl_lock);
- return (1);
+ LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
+ rule = link->rrl_rule;
+
+ if (rule->rr_resource != resource)
+ continue;
+ if (rule->rr_action != RCTL_ACTION_THROTTLE)
+ continue;
+
+ if (rule->rr_amount < minavailable)
+ minavailable = rule->rr_amount;
+ }
+
+ rw_runlock(&rctl_lock);
+
+ if (racct->r_resources[resource] < minavailable) {
+ racct->r_resources[resource] = 0;
+ } else {
+ /*
+ * Cap utilization counter at ten times the limit. Otherwise,
+ * if we accumulated the resource before adding the limiting
+ * rule, it could take unreasonably long time for the counter
+ * value to drop.
+ */
+ if (racct->r_resources[resource] > minavailable * 10)
+ racct->r_resources[resource] = minavailable * 10;
+ else
+ racct->r_resources[resource] -= minavailable;
+ }
}
/*
@@ -328,6 +383,38 @@
return (minavailable);
}
+/* Saturating 64-bit addition: yields UINT64_MAX instead of wrapping. */
+static uint64_t
+xadd(uint64_t a, uint64_t b)
+{
+	uint64_t headroom;
+
+	/*
+	 * 'headroom' is how much can still be added to 'a' without
+	 * wrapping around.
+	 */
+	headroom = UINT64_MAX - a;
+	if (b > headroom)
+		return (UINT64_MAX);
+	return (a + b);
+}
+
+/* Saturating 64-bit multiplication: yields UINT64_MAX on overflow. */
+static uint64_t
+xmul(uint64_t a, uint64_t b)
+{
+
+	if (a == 0 || b == 0)
+		return (0);
+	/*
+	 * Test before multiplying; a wrapped product is not necessarily
+	 * smaller than either operand (e.g. 3 * 2^63 wraps to 2^63).
+	 */
+	if (a > UINT64_MAX / b)
+		return (UINT64_MAX);
+	return (a * b);
+}
+
/*
* Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
* to what it keeps allocated now. Returns non-zero if the allocation should
@@ -339,12 +426,16 @@
struct rctl_rule *rule;
struct rctl_rule_link *link;
struct sbuf sb;
+ int64_t available;
+ uint64_t sleep_ms, sleep_ratio;
int should_deny = 0;
char *buf;
- static int curtime = 0;
- static struct timeval lasttime;
+
+ static int log_curtime = 0, devctl_curtime = 0;
+ static struct timeval log_lasttime, devctl_lasttime;
ASSERT_RACCT_ENABLED();
+ mtx_assert(&racct_lock, MA_OWNED);
rw_rlock(&rctl_lock);
@@ -356,7 +447,9 @@
rule = link->rrl_rule;
if (rule->rr_resource != resource)
continue;
- if (!rctl_would_exceed(p, rule, amount)) {
+
+ available = rctl_available_resource(p, rule);
+ if (available >= (int64_t)amount) {
link->rrl_exceeded = 0;
continue;
}
@@ -383,7 +476,8 @@
if (p->p_state != PRS_NORMAL)
continue;
- if (!ppsratecheck(&lasttime, &curtime, 10))
+ if (!ppsratecheck(&log_lasttime, &log_curtime,
+ rctl_log_ratelimit))
continue;
buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
@@ -408,7 +502,11 @@
if (p->p_state != PRS_NORMAL)
continue;
-
+
+ if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
+ rctl_devctl_ratelimit))
+ continue;
+
buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
if (buf == NULL) {
printf("rctl_enforce: out of memory\n");
@@ -427,6 +525,69 @@
free(buf, M_RCTL);
link->rrl_exceeded = 1;
continue;
+ case RCTL_ACTION_THROTTLE:
+ if (p->p_state != PRS_NORMAL)
+ continue;
+
+ /*
+ * Make the process sleep for a fraction of second
+ * proportional to the ratio of process' resource
+ * utilization compared to the limit. The point is
+ * to penalize resource hogs: processes that consume
+ * more of the available resources sleep for longer.
+ *
+ * We're trying to defer division until the very end,
+ * to minimize the rounding effects. The following
+ * calculation could have been written in a clearer
+ * way like this:
+ *
+ * sleep_ms = hz * p->p_racct->r_resources[resource] /
+ * rule->rr_amount;
+ * sleep_ms *= rctl_throttle_pct / 100;
+ * if (sleep_ms < rctl_throttle_min)
+ * sleep_ms = rctl_throttle_min;
+ *
+ */
+ sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
+ sleep_ms = xmul(sleep_ms, rctl_throttle_pct) / 100;
+ if (sleep_ms < rctl_throttle_min * rule->rr_amount)
+ sleep_ms = rctl_throttle_min * rule->rr_amount;
+
+ /*
+ * Multiply that by the ratio of the resource
+ * consumption for the container compared to the limit,
+ * squared. In other words, a process in a container
+ * that is two times over the limit will be throttled
+ * four times as much for hitting the same rule. The
+ * point is to penalize processes more if the container
+ * itself (eg certain UID or jail) is above the limit.
+ */
+ if (available < 0)
+ sleep_ratio = -available / rule->rr_amount;
+ else
+ sleep_ratio = 0;
+ sleep_ratio = xmul(sleep_ratio, sleep_ratio);
+ sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
+ sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
+
+ /*
+ * Finally the division.
+ */
+ sleep_ms /= rule->rr_amount;
+
+ if (sleep_ms > rctl_throttle_max)
+ sleep_ms = rctl_throttle_max;
+#if 0
+ printf("%s: pid %d (%s), %jd of %jd, will sleep for %ld ms (ratio %ld, available %ld)\n",
+ __func__, p->p_pid, p->p_comm,
+ p->p_racct->r_resources[resource],
+ rule->rr_amount, sleep_ms, sleep_ratio, available);
+#endif
+
+ KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ld < %d\n",
+ __func__, sleep_ms, rctl_throttle_min));
+ racct_proc_throttle(p, sleep_ms);
+ continue;
default:
if (link->rrl_exceeded != 0)
continue;
@@ -642,6 +803,9 @@
if ((size_t)(end - str) != strlen(str))
return (EINVAL);
+ if (*value <= 0)
+ return (ERANGE);
+
return (0);
}
@@ -1008,8 +1172,13 @@
error = str2int64(amountstr, &rule->rr_amount);
if (error != 0)
goto out;
- if (RACCT_IS_IN_MILLIONS(rule->rr_resource))
+ if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
+ if (rule->rr_amount > INT64_MAX / 1000000) {
+ error = ERANGE;
+ goto out;
+ }
rule->rr_amount *= 1000000;
+ }
}
if (perstr == NULL || perstr[0] == '\0')
@@ -1048,20 +1217,27 @@
KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
/*
- * Some rules just don't make sense. Note that the one below
- * cannot be rewritten using RACCT_IS_DENIABLE(); the RACCT_PCTCPU,
- * for example, is not deniable in the racct sense, but the
- * limit is enforced in a different way, so "deny" rules for %CPU
- * do make sense.
+ * Some rules just don't make sense, like "deny" rule for an undeniable
+ * resource. The exception are the RSS and %CPU resources - they are
+ * not deniable in the racct sense, but the limit is enforced in
+ * a different way.
*/
if (rule->rr_action == RCTL_ACTION_DENY &&
- (rule->rr_resource == RACCT_CPU ||
- rule->rr_resource == RACCT_WALLCLOCK))
+ !RACCT_IS_DENIABLE(rule->rr_resource) &&
+ rule->rr_resource != RACCT_RSS &&
+ rule->rr_resource != RACCT_PCTCPU) {
return (EOPNOTSUPP);
+ }
+
+ if (rule->rr_action == RCTL_ACTION_THROTTLE &&
+ !RACCT_IS_DECAYING(rule->rr_resource)) {
+ return (EOPNOTSUPP);
+ }
if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
- RACCT_IS_SLOPPY(rule->rr_resource))
+ RACCT_IS_SLOPPY(rule->rr_resource)) {
return (EOPNOTSUPP);
+ }
/*
* Make sure there are no duplicated rules. Also, for the "deny"
@@ -1346,7 +1522,9 @@
for (i = 0; i <= RACCT_MAX; i++) {
if (sloppy == 0 && RACCT_IS_SLOPPY(i))
continue;
+ mtx_lock(&racct_lock);
amount = racct->r_resources[i];
+ mtx_unlock(&racct_lock);
if (RACCT_IS_IN_MILLIONS(i))
amount /= 1000000;
sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
@@ -1619,6 +1797,81 @@
return (error);
}
+/* Add the number of rules linked to 'racct' that match 'arg2' to *arg3. */
+static void
+rctl_rule_find_callback(struct racct *racct, void *arg2, void *arg3)
+{
+	struct rctl_rule *wanted = arg2;
+	struct rctl_rule_link *cur;
+	int *total = arg3;
+	int hits = 0;
+
+	ASSERT_RACCT_ENABLED();
+	rw_assert(&rctl_lock, RA_LOCKED);
+
+	LIST_FOREACH(cur, &racct->r_rule_links, rrl_next)
+		if (rctl_rule_matches(cur->rrl_rule, wanted))
+			hits++;
+	/* Accumulate so that one counter can span many containers. */
+	*total += hits;
+}
+
+/*
+ * Switch %CPU enforcement between "deny" (val == 0) and "throttle"
+ * (val == 1) mode by updating racct_pctcpu_throttle_not_deny.
+ *
+ * The switch is refused with EOPNOTSUPP while any rule of the opposite
+ * kind exists: "pcpu:deny" rules block switching to 1, "pcpu:throttle"
+ * rules block switching to anything else.  Returns 0 on success or when
+ * the requested mode is already in effect.
+ */
+static int
+rctl_set_pctcpu_throttle_not_deny(int val)
+{
+	struct rctl_rule *pattern;
+	struct proc *pp;
+	int clashes = 0;
+
+	if (val == racct_pctcpu_throttle_not_deny)
+		return (0);
+
+	/*
+	 * Holding allproc_lock lets us walk the process list, and also
+	 * keeps rctl_rule_add(), which takes the same lock, from slipping
+	 * in a clashing rule between the search and the assignment below.
+	 */
+	sx_assert(&allproc_lock, SA_LOCKED);
+
+	/* Build a filter matching the rules that would clash with 'val'. */
+	pattern = rctl_rule_alloc(M_WAITOK);
+	pattern->rr_resource = RACCT_PCTCPU;
+	pattern->rr_action = (val == 1) ?
+	    RCTL_ACTION_DENY : RCTL_ACTION_THROTTLE;
+
+	/* Per-process containers first, then loginclass, uid and jail. */
+	rctl_rule_pre_callback();
+	FOREACH_PROC_IN_SYSTEM(pp)
+		rctl_rule_find_callback(pp->p_racct, pattern, &clashes);
+	rctl_rule_post_callback();
+
+	loginclass_racct_foreach(rctl_rule_find_callback,
+	    rctl_rule_pre_callback, rctl_rule_post_callback,
+	    pattern, &clashes);
+	ui_racct_foreach(rctl_rule_find_callback,
+	    rctl_rule_pre_callback, rctl_rule_post_callback,
+	    pattern, &clashes);
+	prison_racct_foreach(rctl_rule_find_callback,
+	    rctl_rule_pre_callback, rctl_rule_post_callback,
+	    pattern, &clashes);
+
+	rctl_rule_release(pattern);
+	if (clashes == 0) {
+		racct_pctcpu_throttle_not_deny = val;
+		return (0);
+	}
+	return (EOPNOTSUPP);
+}
+
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{
@@ -1656,6 +1909,19 @@
goto out;
}
+ if (rule->rr_resource == RACCT_PCTCPU) {
+ if (rule->rr_action == RCTL_ACTION_DENY) {
+ error = rctl_set_pctcpu_throttle_not_deny(0);
+ if (error != 0)
+ goto out;
+ }
+ if (rule->rr_action == RCTL_ACTION_THROTTLE) {
+ error = rctl_set_pctcpu_throttle_not_deny(1);
+ if (error != 0)
+ goto out;
+ }
+ }
+
error = rctl_rule_add(rule);
out:
@@ -1935,6 +2201,15 @@
UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+
+ if (rctl_throttle_min <= 0)
+ rctl_throttle_min = 1;
+ if (rctl_throttle_max <= 0)
+ rctl_throttle_max = 2 * hz;
+ if (rctl_throttle_pct <= 0)
+ rctl_throttle_pct = 100;
+ if (rctl_throttle_pct2 <= 0)
+ rctl_throttle_pct2 = 100;
}
#else /* !RCTL */
Index: sys/kern/subr_trap.c
===================================================================
--- sys/kern/subr_trap.c
+++ sys/kern/subr_trap.c
@@ -172,10 +172,14 @@
(td->td_vnet_lpush != NULL) ? td->td_vnet_lpush : "N/A"));
#endif
#ifdef RACCT
- if (racct_enable && p->p_throttled == 1) {
+ if (racct_enable && p->p_throttled != 0) {
PROC_LOCK(p);
- while (p->p_throttled == 1)
- msleep(p->p_racct, &p->p_mtx, 0, "racct", 0);
+ while (p->p_throttled != 0) {
+ msleep(p->p_racct, &p->p_mtx, 0, "racct",
+ p->p_throttled < 0 ? 0 : p->p_throttled);
+ if (p->p_throttled > 0)
+ p->p_throttled = 0;
+ }
PROC_UNLOCK(p);
}
#endif
Index: sys/kern/vfs_bio.c
===================================================================
--- sys/kern/vfs_bio.c
+++ sys/kern/vfs_bio.c
@@ -61,6 +61,7 @@
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/smp.h>
@@ -1784,8 +1785,16 @@
rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
if ((rabp->b_flags & B_CACHE) == 0) {
- if (!TD_IS_IDLETHREAD(curthread))
+ if (!TD_IS_IDLETHREAD(curthread)) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, rabp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
+ }
rabp->b_flags |= B_ASYNC;
rabp->b_flags &= ~B_INVAL;
rabp->b_ioflags &= ~BIO_ERROR;
@@ -1829,8 +1838,16 @@
/* if not found in cache, do some I/O */
if ((bp->b_flags & B_CACHE) == 0) {
- if (!TD_IS_IDLETHREAD(curthread))
+ if (!TD_IS_IDLETHREAD(curthread)) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
+ }
bp->b_iocmd = BIO_READ;
bp->b_flags &= ~B_INVAL;
bp->b_ioflags &= ~BIO_ERROR;
@@ -1926,8 +1943,16 @@
bp->b_runningbufspace = bp->b_bufsize;
space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
- if (!TD_IS_IDLETHREAD(curthread))
+ if (!TD_IS_IDLETHREAD(curthread)) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 1);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_oublock++;
+ }
if (oldflags & B_ASYNC)
BUF_KERNPROC(bp);
bp->b_iooffset = dbtob(bp->b_blkno);
Index: sys/kern/vfs_cluster.c
===================================================================
--- sys/kern/vfs_cluster.c
+++ sys/kern/vfs_cluster.c
@@ -45,6 +45,7 @@
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/vmmeter.h>
@@ -241,6 +242,13 @@
BUF_KERNPROC(bp);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
}
@@ -294,6 +302,13 @@
BUF_KERNPROC(rbp);
rbp->b_iooffset = dbtob(rbp->b_blkno);
bstrategy(rbp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, rbp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
}
Index: sys/sys/proc.h
===================================================================
--- sys/sys/proc.h
+++ sys/sys/proc.h
@@ -620,7 +620,7 @@
after fork. */
uint64_t p_prev_runtime; /* (c) Resource usage accounting. */
struct racct *p_racct; /* (b) Resource accounting. */
- u_char p_throttled; /* (c) Flag for racct pcpu throttling */
+ int p_throttled; /* (c) Flag for racct pcpu throttling */
struct vm_domain_policy p_vm_dom_policy; /* (c) process default VM domain, or -1 */
/*
* An orphan is the child that has beed re-parented to the
Index: sys/sys/racct.h
===================================================================
--- sys/sys/racct.h
+++ sys/sys/racct.h
@@ -42,6 +42,7 @@
#include <sys/stdint.h>
#include <sys/sysctl.h>
+struct buf;
struct proc;
struct rctl_rule_link;
struct ucred;
@@ -71,7 +72,11 @@
#define RACCT_SHMSIZE 18
#define RACCT_WALLCLOCK 19
#define RACCT_PCTCPU 20
-#define RACCT_MAX RACCT_PCTCPU
+#define RACCT_READBPS 21
+#define RACCT_WRITEBPS 22
+#define RACCT_READIOPS 23
+#define RACCT_WRITEIOPS 24
+#define RACCT_MAX RACCT_WRITEIOPS
/*
* Resource properties.
@@ -85,6 +90,7 @@
extern int racct_types[];
extern int racct_enable;
+extern int racct_pctcpu_throttle_not_deny;
#define ASSERT_RACCT_ENABLED() KASSERT(racct_enable, \
("%s called with !racct_enable", __func__))
@@ -98,7 +104,7 @@
/*
* Resource usage can drop, as opposed to only grow. When the process
- * terminates, its resource usage is freed from the respective
+ * terminates, its resource usage is subtracted from the respective
* per-credential racct containers.
*/
#define RACCT_IS_RECLAIMABLE(X) (racct_types[X] & RACCT_RECLAIMABLE)
@@ -126,8 +132,7 @@
* When a process terminates, its resource usage is not automatically
* subtracted from per-credential racct containers. Instead, the resource
* usage of per-credential racct containers decays in time.
- * Resource usage can olso drop for such resource.
- * So far, the only such resource is RACCT_PCTCPU.
+ * Resource usage can also drop for such resources.
*/
#define RACCT_IS_DECAYING(X) (racct_types[X] & RACCT_DECAYING)
@@ -149,11 +154,14 @@
SYSCTL_DECL(_kern_racct);
+extern struct mtx racct_lock;
+
#ifdef RACCT
int racct_add(struct proc *p, int resource, uint64_t amount);
void racct_add_cred(struct ucred *cred, int resource, uint64_t amount);
void racct_add_force(struct proc *p, int resource, uint64_t amount);
+void racct_add_buf(struct proc *p, const struct buf *bufp, int is_write);
int racct_set(struct proc *p, int resource, uint64_t amount);
void racct_set_force(struct proc *p, int resource, uint64_t amount);
void racct_sub(struct proc *p, int resource, uint64_t amount);
@@ -171,6 +179,7 @@
void racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
struct ucred *newcred);
void racct_move(struct racct *dest, struct racct *src);
+void racct_proc_throttle(struct proc *p, int timeout);
#else
Index: sys/sys/rctl.h
===================================================================
--- sys/sys/rctl.h
+++ sys/sys/rctl.h
@@ -129,7 +129,8 @@
#define RCTL_ACTION_DENY (RCTL_ACTION_SIGNAL_MAX + 1)
#define RCTL_ACTION_LOG (RCTL_ACTION_SIGNAL_MAX + 2)
#define RCTL_ACTION_DEVCTL (RCTL_ACTION_SIGNAL_MAX + 3)
-#define RCTL_ACTION_MAX RCTL_ACTION_DEVCTL
+#define RCTL_ACTION_THROTTLE (RCTL_ACTION_SIGNAL_MAX + 4)
+#define RCTL_ACTION_MAX RCTL_ACTION_THROTTLE
#define RCTL_AMOUNT_UNDEFINED -1
@@ -140,6 +141,7 @@
int rctl_rule_add(struct rctl_rule *rule);
int rctl_rule_remove(struct rctl_rule *filter);
int rctl_enforce(struct proc *p, int resource, uint64_t amount);
+void rctl_throttle_decay(struct racct *racct, int resource);
int64_t rctl_pcpu_available(const struct proc *p);
uint64_t rctl_get_limit(struct proc *p, int resource);
uint64_t rctl_get_available(struct proc *p, int resource);
Index: sys/ufs/ffs/ffs_inode.c
===================================================================
--- sys/ufs/ffs/ffs_inode.c
+++ sys/ufs/ffs/ffs_inode.c
@@ -41,6 +41,7 @@
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/random.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
@@ -616,6 +617,13 @@
vp = ITOV(ip);
bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0, 0);
if ((bp->b_flags & B_CACHE) == 0) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++; /* pay for read */
bp->b_iocmd = BIO_READ;
bp->b_flags &= ~B_INVAL;
Index: sys/ufs/ffs/ffs_softdep.c
===================================================================
--- sys/ufs/ffs/ffs_softdep.c
+++ sys/ufs/ffs/ffs_softdep.c
@@ -69,6 +69,7 @@
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
@@ -6229,6 +6230,13 @@
vfs_busy_pages(bp, 0);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
error = bufwait(bp);
if (error) {
Index: sys/ufs/ufs/ufs_bmap.c
===================================================================
--- sys/ufs/ufs/ufs_bmap.c
+++ sys/ufs/ufs/ufs_bmap.c
@@ -44,6 +44,7 @@
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
@@ -223,6 +224,13 @@
vfs_busy_pages(bp, 0);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
error = bufwait(bp);
if (error) {
Index: sys/vm/vm_fault.c
===================================================================
--- sys/vm/vm_fault.c
+++ sys/vm/vm_fault.c
@@ -83,6 +83,7 @@
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
@@ -994,6 +995,21 @@
if (hardfault) {
PCPU_INC(cnt.v_io_faults);
curthread->td_ru.ru_majflt++;
+#ifdef RACCT
+ if (racct_enable && fs.object->type == OBJT_VNODE) {
+ PROC_LOCK(curproc);
+ if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
+ racct_add_force(curproc, RACCT_WRITEBPS,
+ PAGE_SIZE + behind * PAGE_SIZE);
+ racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+ } else {
+ racct_add_force(curproc, RACCT_READBPS,
+ PAGE_SIZE + ahead * PAGE_SIZE);
+ racct_add_force(curproc, RACCT_READIOPS, 1);
+ }
+ PROC_UNLOCK(curproc);
+ }
+#endif
} else
curthread->td_ru.ru_minflt++;
Index: usr.bin/rctl/rctl.8
===================================================================
--- usr.bin/rctl/rctl.8
+++ usr.bin/rctl/rctl.8
@@ -25,7 +25,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd November 29, 2015
+.Dd January 30, 2016
.Dt RCTL 8
.Os
.Sh NAME
@@ -204,14 +204,22 @@
.It Sy shmsize Ta "SysV shared memory size, in bytes"
.It Sy wallclock Ta "wallclock time, in seconds"
.It Sy pcpu Ta "%CPU, in percents of a single CPU core"
+.It Sy readbps Ta "filesystem reads, in bytes per second"
+.It Sy writebps Ta "filesystem writes, in bytes per second"
+.It Sy readiops Ta "filesystem reads, in operations per second"
+.It Sy writeiops Ta "filesystem writes, in operations per second"
.El
.Sh ACTIONS
.Bl -column -offset 3n "pseudoterminals"
.It Em action
.It Sy deny Ta deny the allocation; not supported for
-.Sy cputime
+.Sy cputime ,
+.Sy wallclock ,
+.Sy readbps ,
+.Sy writebps ,
+.Sy readiops ,
and
-.Sy wallclock
+.Sy writeiops
.It Sy log Ta "log a warning to the console"
.It Sy devctl Ta "send notification to"
.Xr devd 8
@@ -228,6 +236,19 @@
See
.Xr signal 3
for a list of supported signals
+.It Sy throttle Ta "slow down process execution"; only supported for
+.Sy pcpu ,
+.Sy readbps ,
+.Sy writebps ,
+.Sy readiops ,
+and
+.Sy writeiops .
+Note that one cannot simultaneously have both
+.Sy throttle
+and
+.Sy deny
+rules for
+.Sy pcpu .
.El
.Pp
Not all actions are supported for all resources.
@@ -287,3 +308,22 @@
Limiting
.Sy memoryuse
may kill the machine due to thrashing.
+.Pp
+The
+.Sy readiops
+and
+.Sy writeiops
+counters are only approximations.
+Like
+.Sy readbps
+and
+.Sy writebps ,
+they are calculated in the filesystem layer, where it is difficult
+or even impossible to observe actual disk device operations.
+.Pp
+The
+.Sy writebps
+and
+.Sy writeiops
+resources generally account for writes to the filesystem cache,
+not to actual devices.

File Metadata

Mime Type
text/plain
Expires
Wed, Apr 22, 8:48 AM (19 h, 35 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
31966107
Default Alt Text
D5080.id12953.diff (43 KB)

Event Timeline