Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F153426819
D5080.id12715.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
43 KB
Referenced Files
None
Subscribers
None
D5080.id12715.diff
View Options
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -132,6 +132,7 @@
#include <sys/multilist.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
+#include <sys/racct.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
@@ -4503,6 +4504,18 @@
demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
data, metadata, misses);
#ifdef _KERNEL
+#ifdef RACCT
+ if (racct_enable) {
+#if 0
+ printf("%s: adding %jd bytes for %d (%s)\n",
+ __func__, (uintmax_t)size, curproc->p_pid, curproc->p_comm);
+#endif
+ PROC_LOCK(curproc);
+ racct_add_force(curproc, RACCT_READBPS, size);
+ racct_add_force(curproc, RACCT_READIOPS, 1);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
#endif
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -47,6 +47,7 @@
#include <sys/sa.h>
#include <sys/zfeature.h>
#ifdef _KERNEL
+#include <sys/racct.h>
#include <sys/vm.h>
#include <sys/zfs_znode.h>
#endif
@@ -427,6 +428,15 @@
}
dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+#if defined(_KERNEL) && defined(RACCT)
+ if (racct_enable && !read) {
+ PROC_LOCK(curproc);
+ racct_add_force(curproc, RACCT_WRITEBPS, length);
+ racct_add_force(curproc, RACCT_WRITEIOPS, nblks);
+ PROC_UNLOCK(curproc);
+ }
+#endif
+
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, 0, offset);
for (i = 0; i < nblks; i++) {
@@ -1422,7 +1432,15 @@
DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) {
#ifdef _KERNEL
curthread->td_ru.ru_oublock++;
-#endif
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_force(curproc, RACCT_WRITEBPS, blksz);
+ racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
+#endif /* _KERNEL */
dbuf_assign_arcbuf(db, buf, tx);
dbuf_rele(db, FTAG);
} else {
Index: sys/fs/ext2fs/ext2_bmap.c
===================================================================
--- sys/fs/ext2fs/ext2_bmap.c
+++ sys/fs/ext2fs/ext2_bmap.c
@@ -42,6 +42,7 @@
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
@@ -243,6 +244,13 @@
vfs_busy_pages(bp, 0);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif
curthread->td_ru.ru_inblock++;
error = bufwait(bp);
if (error) {
Index: sys/kern/kern_physio.c
===================================================================
--- sys/kern/kern_physio.c
+++ sys/kern/kern_physio.c
@@ -27,6 +27,7 @@
#include <sys/conf.h>
#include <sys/malloc.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/uio.h>
#include <geom/geom.h>
@@ -109,6 +110,22 @@
prot |= VM_PROT_WRITE; /* Less backwards than it looks */
error = 0;
for (i = 0; i < uio->uio_iovcnt; i++) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ if (uio->uio_rw == UIO_READ) {
+ racct_add_force(curproc, RACCT_READBPS,
+ uio->uio_iov[i].iov_len);
+ racct_add_force(curproc, RACCT_READIOPS, 1);
+ } else {
+ racct_add_force(curproc, RACCT_WRITEBPS,
+ uio->uio_iov[i].iov_len);
+ racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+ }
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
+
while (uio->uio_iov[i].iov_len) {
bzero(bp, sizeof(*bp));
if (uio->uio_rw == UIO_READ) {
Index: sys/kern/kern_racct.c
===================================================================
--- sys/kern/kern_racct.c
+++ sys/kern/kern_racct.c
@@ -35,6 +35,7 @@
#include "opt_sched.h"
#include <sys/param.h>
+#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/jail.h>
@@ -66,21 +67,25 @@
FEATURE(racct, "Resource Accounting");
-/*
- * Do not block processes that have their %cpu usage <= pcpu_threshold.
- */
-static int pcpu_threshold = 1;
#ifdef RACCT_DEFAULT_TO_DISABLED
int racct_enable = 0;
#else
int racct_enable = 1;
#endif
+/*
+ * Do not block processes that have their %cpu usage <= pcpu_threshold.
+ */
+static int pcpu_threshold = 1;
+int racct_pctcpu_throttle_not_deny = 0;
SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting");
SYSCTL_UINT(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable,
0, "Enable RACCT/RCTL");
SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold,
- 0, "Processes with higher %cpu usage than this value can be throttled.");
+ 0, "\"pcpu:deny\" rules don't affect processes with %cpu below this value");
+SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_throttle_not_deny, CTLFLAG_RD,
+ &racct_pctcpu_throttle_not_deny,
+ 0, "Use \"pcpu:throttle\" rules instead of \"pcpu:deny\"");
/*
* How many seconds it takes to use the scheduler %cpu calculations. When a
@@ -90,7 +95,7 @@
*/
#define RACCT_PCPU_SECS 3
-static struct mtx racct_lock;
+struct mtx racct_lock;
MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF);
static uma_zone_t racct_zone;
@@ -171,7 +176,16 @@
[RACCT_WALLCLOCK] =
RACCT_IN_MILLIONS,
[RACCT_PCTCPU] =
- RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS };
+ RACCT_RECLAIMABLE | RACCT_DECAYING | RACCT_DENIABLE |
+ RACCT_IN_MILLIONS,
+ [RACCT_READBPS] =
+ RACCT_DECAYING,
+ [RACCT_WRITEBPS] =
+ RACCT_DECAYING,
+ [RACCT_READIOPS] =
+ RACCT_DECAYING,
+ [RACCT_WRITEIOPS] =
+ RACCT_DECAYING };
static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
@@ -473,6 +487,8 @@
for (i = 0; i <= RACCT_MAX; i++) {
if (RACCT_IS_SLOPPY(i))
continue;
+ if (RACCT_IS_DECAYING(i))
+ continue;
if (!RACCT_IS_RECLAIMABLE(i))
continue;
KASSERT(racct->r_resources[i] == 0,
@@ -498,8 +514,8 @@
/*
* Increase consumption of 'resource' by 'amount' for 'racct',
- * but not its parents. Differently from other cases, 'amount' here
- * may be less than zero.
+ * but not its parents. Differently from other cases, 'amount'
+ * here may be less than zero.
*/
static void
racct_adjust_resource(struct racct *racct, int resource,
@@ -526,9 +542,10 @@
* returns for a thread more than 100% cpu usage. So we set a sane
* boundary here to 100% * the maxumum number of CPUs.
*/
- if ((resource == RACCT_PCTCPU) &&
- (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU))
+ if (!racct_pctcpu_throttle_not_deny && resource == RACCT_PCTCPU &&
+ racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU) {
racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU;
+ }
}
static int
@@ -612,16 +629,11 @@
mtx_unlock(&racct_lock);
}
-/*
- * Increase allocation of 'resource' by 'amount' for process 'p'.
- * Doesn't check for limits and never fails.
- */
-void
-racct_add_force(struct proc *p, int resource, uint64_t amount)
+static void
+racct_add_force_locked(struct proc *p, int resource, uint64_t amount)
{
- if (!racct_enable)
- return;
+ ASSERT_RACCT_ENABLED();
SDT_PROBE3(racct, , rusage, add__force, p, resource, amount);
@@ -630,12 +642,48 @@
*/
PROC_LOCK_ASSERT(p, MA_OWNED);
- mtx_lock(&racct_lock);
+ /*
+ * Ignore the return value - we can't return an error, but RCTL might
+ * e.g. throttle the process.
+ */
+ (void)rctl_enforce(p, resource, amount);
+
racct_adjust_resource(p->p_racct, resource, amount);
racct_add_cred_locked(p->p_ucred, resource, amount);
+}
+
+/*
+ * Increase allocation of 'resource' by 'amount' for process 'p'.
+ * Checks for limits, but never fails.
+ */
+void
+racct_add_force(struct proc *p, int resource, uint64_t amount)
+{
+
+ if (!racct_enable)
+ return;
+
+ mtx_lock(&racct_lock);
+ racct_add_force_locked(p, resource, amount);
mtx_unlock(&racct_lock);
}
+/*
+ * Account a single buffer I/O to process 'p': add the buffer's b_bcount
+ * to the byte counter and one operation to the IOPS counter, using the
+ * write resources when 'is_write' is non-zero and the read resources
+ * otherwise.  The caller must hold the lock of 'p' and must have checked
+ * that racct is enabled.
+ */
+void
+racct_add_buf(struct proc *p, const struct buf *bp, int is_write)
+{
+
+	ASSERT_RACCT_ENABLED();
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	/*
+	 * Charge the process whose lock was asserted above, not curproc;
+	 * the two are only the same when the caller passes curproc.
+	 */
+	if (is_write) {
+		racct_add_force(p, RACCT_WRITEBPS, bp->b_bcount);
+		racct_add_force(p, RACCT_WRITEIOPS, 1);
+	} else {
+		racct_add_force(p, RACCT_READBPS, bp->b_bcount);
+		racct_add_force(p, RACCT_READIOPS, 1);
+	}
+}
+
static int
racct_set_locked(struct proc *p, int resource, uint64_t amount)
{
@@ -659,7 +707,7 @@
* The diffs may be negative.
*/
diff_proc = amount - old_amount;
- if (RACCT_IS_DECAYING(resource)) {
+ if (!racct_pctcpu_throttle_not_deny && RACCT_IS_DECAYING(resource)) {
/*
* Resources in per-credential racct containers may decay.
* If this is the case, we need to calculate the difference
@@ -688,7 +736,7 @@
racct_adjust_resource(p->p_racct, resource, diff_proc);
if (diff_cred > 0)
racct_add_cred_locked(p->p_ucred, resource, diff_cred);
- else if (diff_cred < 0)
+ else if (RACCT_IS_RECLAIMABLE(resource) && diff_cred < 0)
racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
return (0);
@@ -735,7 +783,7 @@
* The diffs may be negative.
*/
diff_proc = amount - old_amount;
- if (RACCT_IS_DECAYING(resource)) {
+ if (!racct_pctcpu_throttle_not_deny && RACCT_IS_DECAYING(resource)) {
/*
* Resources in per-credential racct containers may decay.
* If this is the case, we need to calculate the difference
@@ -750,7 +798,7 @@
racct_adjust_resource(p->p_racct, resource, diff_proc);
if (diff_cred > 0)
racct_add_cred_locked(p->p_ucred, resource, diff_cred);
- else if (diff_cred < 0)
+ else if (RACCT_IS_RECLAIMABLE(resource) && diff_cred < 0)
racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
}
@@ -850,7 +898,8 @@
(intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid));
racct_adjust_resource(p->p_racct, resource, -amount);
- racct_sub_cred_locked(p->p_ucred, resource, amount);
+ if (RACCT_IS_RECLAIMABLE(resource))
+ racct_sub_cred_locked(p->p_ucred, resource, amount);
mtx_unlock(&racct_lock);
}
@@ -994,13 +1043,17 @@
#endif
microuptime(&wallclock);
timevalsub(&wallclock, &p->p_stats->p_start);
- if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
- pct_estimate = (1000000 * runtime * 100) /
- ((uint64_t)wallclock.tv_sec * 1000000 +
- wallclock.tv_usec);
- } else
- pct_estimate = 0;
- pct = racct_getpcpu(p, pct_estimate);
+ if (!racct_pctcpu_throttle_not_deny) {
+ if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
+ pct_estimate = (1000000 * runtime * 100) /
+ ((uint64_t)wallclock.tv_sec * 1000000 +
+ wallclock.tv_usec);
+ } else
+ pct_estimate = 0;
+ pct = racct_getpcpu(p, pct_estimate);
+ } else {
+ pct = (runtime - p->p_prev_runtime) * 100;
+ }
mtx_lock(&racct_lock);
racct_set_locked(p, RACCT_CPU, runtime);
@@ -1085,14 +1138,15 @@
mtx_unlock(&racct_lock);
}
-static void
-racct_proc_throttle(struct proc *p)
+void
+racct_proc_throttle(struct proc *p, int timeout)
{
struct thread *td;
#ifdef SMP
int cpuid;
#endif
+ KASSERT(timeout != 0, ("timeout %d", timeout));
ASSERT_RACCT_ENABLED();
PROC_LOCK_ASSERT(p, MA_OWNED);
@@ -1100,10 +1154,13 @@
* Do not block kernel processes. Also do not block processes with
* low %cpu utilization to improve interactivity.
*/
- if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) ||
- (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold))
+ if ((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0)
+ return;
+
+ if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout))
return;
- p->p_throttled = 1;
+
+ p->p_throttled = timeout;
FOREACH_THREAD_IN_PROC(p, td) {
thread_lock(td);
@@ -1144,30 +1201,37 @@
PROC_LOCK_ASSERT(p, MA_OWNED);
- if (p->p_throttled) {
+ if (p->p_throttled != 0) {
p->p_throttled = 0;
wakeup(p->p_racct);
}
}
static void
-racct_decay_resource(struct racct *racct, void * res, void* dummy)
+racct_decay_callback(struct racct *racct, void *res, void *dummy)
{
- int resource;
int64_t r_old, r_new;
ASSERT_RACCT_ENABLED();
mtx_assert(&racct_lock, MA_OWNED);
- resource = *(int *)res;
- r_old = racct->r_resources[resource];
+ rctl_throttle_decay(racct, RACCT_READBPS);
+ rctl_throttle_decay(racct, RACCT_WRITEBPS);
+ rctl_throttle_decay(racct, RACCT_READIOPS);
+ rctl_throttle_decay(racct, RACCT_WRITEIOPS);
+
+ if (racct_pctcpu_throttle_not_deny) {
+ rctl_throttle_decay(racct, RACCT_PCTCPU);
+ } else {
+ r_old = racct->r_resources[RACCT_PCTCPU];
+
+ /* If there is nothing to decay, just exit. */
+ if (r_old <= 0)
+ return;
- /* If there is nothing to decay, just exit. */
- if (r_old <= 0)
- return;
-
- r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
- racct->r_resources[resource] = r_new;
+ r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
+ racct->r_resources[RACCT_PCTCPU] = r_new;
+ }
}
static void
@@ -1185,17 +1249,17 @@
}
static void
-racct_decay(int resource)
+racct_decay(void)
{
ASSERT_RACCT_ENABLED();
- ui_racct_foreach(racct_decay_resource, racct_decay_pre,
- racct_decay_post, &resource, NULL);
- loginclass_racct_foreach(racct_decay_resource, racct_decay_pre,
- racct_decay_post, &resource, NULL);
- prison_racct_foreach(racct_decay_resource, racct_decay_pre,
- racct_decay_post, &resource, NULL);
+ ui_racct_foreach(racct_decay_callback, racct_decay_pre,
+ racct_decay_post, NULL, NULL);
+ loginclass_racct_foreach(racct_decay_callback, racct_decay_pre,
+ racct_decay_post, NULL, NULL);
+ prison_racct_foreach(racct_decay_callback, racct_decay_pre,
+ racct_decay_post, NULL, NULL);
}
static void
@@ -1206,18 +1270,22 @@
struct timeval wallclock;
uint64_t runtime;
uint64_t pct, pct_estimate;
+ bool throttle;
+ //int error;
ASSERT_RACCT_ENABLED();
for (;;) {
- racct_decay(RACCT_PCTCPU);
+ racct_decay();
sx_slock(&allproc_lock);
- LIST_FOREACH(p, &zombproc, p_list) {
- PROC_LOCK(p);
- racct_set(p, RACCT_PCTCPU, 0);
- PROC_UNLOCK(p);
+ if (!racct_pctcpu_throttle_not_deny) {
+ LIST_FOREACH(p, &zombproc, p_list) {
+ PROC_LOCK(p);
+ racct_set(p, RACCT_PCTCPU, 0);
+ PROC_UNLOCK(p);
+ }
}
FOREACH_PROC_IN_SYSTEM(p) {
@@ -1241,24 +1309,63 @@
if (runtime < p->p_prev_runtime)
runtime = p->p_prev_runtime;
#endif
- p->p_prev_runtime = runtime;
- if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
- pct_estimate = (1000000 * runtime * 100) /
- ((uint64_t)wallclock.tv_sec * 1000000 +
- wallclock.tv_usec);
- } else
- pct_estimate = 0;
- pct = racct_getpcpu(p, pct_estimate);
+ if (!racct_pctcpu_throttle_not_deny) {
+ if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
+ pct_estimate = (1000000 * runtime * 100) /
+ ((uint64_t)wallclock.tv_sec * 1000000 +
+ wallclock.tv_usec);
+ } else
+ pct_estimate = 0;
+ pct = racct_getpcpu(p, pct_estimate);
+ } else {
+ pct = (runtime - p->p_prev_runtime) * 100;
+ }
mtx_lock(&racct_lock);
- racct_set_force_locked(p, RACCT_PCTCPU, pct);
+
+ rctl_throttle_decay(p->p_racct, RACCT_READBPS);
+ rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS);
+ rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
+ rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
+ if (racct_pctcpu_throttle_not_deny) {
+ rctl_throttle_decay(p->p_racct, RACCT_PCTCPU);
+ /*
+ * Preferably we would bump the counters
+ * from statclock. We cannot, however - to call
+ * racct_add() we need the proc lock, and we
+ * cannot acquire that in statclock().
+ *
+ * Note that 1% basically means 10 milliseconds.
+ */
+ racct_add_force_locked(p, RACCT_PCTCPU, pct);
+ } else {
+ racct_set_force_locked(p, RACCT_PCTCPU, pct);
+ }
+ p->p_prev_runtime = runtime;
+
+#if 0
+ if (racct_pctcpu_throttle_not_deny) {
+ /*
+ * Ignore the return value; there cannot
+ * be a "deny" rule there.
+ */
+ error = rctl_enforce(p, RACCT_PCTCPU, 0);
+ KASSERT(error == 0,
+ ("rctl_enforce() returned %d", error));
+ }
+#endif
+
racct_set_locked(p, RACCT_CPU, runtime);
racct_set_locked(p, RACCT_WALLCLOCK,
(uint64_t)wallclock.tv_sec * 1000000 +
wallclock.tv_usec);
+
mtx_unlock(&racct_lock);
PROC_UNLOCK(p);
}
+ if (racct_pctcpu_throttle_not_deny)
+ goto done;
+
/*
* To ensure that processes are throttled in a fair way, we need
* to iterate over all processes again and check the limits
@@ -1272,12 +1379,18 @@
continue;
}
- if (racct_pcpu_available(p) <= 0)
- racct_proc_throttle(p);
- else if (p->p_throttled)
+ throttle = false;
+ if (racct_pcpu_available(p) <= 0 &&
+ (p->p_racct->r_resources[RACCT_PCTCPU] > pcpu_threshold))
+ throttle = true;
+
+ if (throttle)
+ racct_proc_throttle(p, -1);
+ else if (p->p_throttled == -1)
racct_proc_wakeup(p);
PROC_UNLOCK(p);
}
+done:
sx_sunlock(&allproc_lock);
pause("-", hz);
}
Index: sys/kern/kern_rctl.c
===================================================================
--- sys/kern/kern_rctl.c
+++ sys/kern/kern_rctl.c
@@ -77,11 +77,31 @@
#define RCTL_PCPU_SHIFT (10 * 1000000)
-unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
+static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
+static int rctl_log_ratelimit = 10;
+static int rctl_devctl_ratelimit = 10;
+static unsigned int rctl_throttle_min = 0;
+static unsigned int rctl_throttle_max = 0;
+static unsigned int rctl_throttle_pct = 0;
+static unsigned int rctl_throttle_pct2 = 0;
SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
&rctl_maxbufsize, 0, "Maximum output buffer size");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, rctl_log_ratelimit, CTLFLAG_RW,
+ &rctl_log_ratelimit, 0, "Maximum number of log messages per second");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, rctl_devctl_ratelimit, CTLFLAG_RW,
+ &rctl_devctl_ratelimit, 0, "Maximum number of devctl messages per second");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_min, CTLFLAG_RDTUN,
+ &rctl_throttle_min, 0, "Shortest throttling duration, in hz");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_max, CTLFLAG_RDTUN,
+ &rctl_throttle_max, 0, "Longest throttling duration, in hz");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_pct, CTLFLAG_RDTUN,
+ &rctl_throttle_pct, 0,
+ "Throttling penalty for process consumption, in percent");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_pct2, CTLFLAG_RDTUN,
+ &rctl_throttle_pct2, 0,
+ "Throttling penalty for container consumption, in percent");
/*
* 'rctl_rule_link' connects a rule with every racct it's related to.
@@ -128,6 +148,10 @@
{ "shmsize", RACCT_SHMSIZE },
{ "wallclock", RACCT_WALLCLOCK },
{ "pcpu", RACCT_PCTCPU },
+ { "readbps", RACCT_READBPS },
+ { "writebps", RACCT_WRITEBPS },
+ { "readiops", RACCT_READIOPS },
+ { "writeiops", RACCT_WRITEIOPS },
{ NULL, -1 }};
static struct dict actionnames[] = {
@@ -165,6 +189,7 @@
{ "deny", RCTL_ACTION_DENY },
{ "log", RCTL_ACTION_LOG },
{ "devctl", RCTL_ACTION_DEVCTL },
+ { "throttle", RCTL_ACTION_THROTTLE },
{ NULL, -1 }};
static void rctl_init(void);
@@ -219,66 +244,96 @@
panic("rctl_resource_name: unknown resource %d", resource);
}
-/*
- * Return the amount of resource that can be allocated by 'p' before
- * hitting 'rule'.
- */
-static int64_t
-rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
+static struct racct *
+rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
{
- int resource;
- int64_t available = INT64_MAX;
struct ucred *cred = p->p_ucred;
ASSERT_RACCT_ENABLED();
rw_assert(&rctl_lock, RA_LOCKED);
- resource = rule->rr_resource;
switch (rule->rr_per) {
case RCTL_SUBJECT_TYPE_PROCESS:
- available = rule->rr_amount -
- p->p_racct->r_resources[resource];
- break;
+ return (p->p_racct);
case RCTL_SUBJECT_TYPE_USER:
- available = rule->rr_amount -
- cred->cr_ruidinfo->ui_racct->r_resources[resource];
- break;
+ return (cred->cr_ruidinfo->ui_racct);
case RCTL_SUBJECT_TYPE_LOGINCLASS:
- available = rule->rr_amount -
- cred->cr_loginclass->lc_racct->r_resources[resource];
- break;
+ return (cred->cr_loginclass->lc_racct);
case RCTL_SUBJECT_TYPE_JAIL:
- available = rule->rr_amount -
- cred->cr_prison->pr_prison_racct->prr_racct->
- r_resources[resource];
- break;
+ return (cred->cr_prison->pr_prison_racct->prr_racct);
default:
- panic("rctl_compute_available: unknown per %d",
- rule->rr_per);
+ panic("%s: unknown per %d", __func__, rule->rr_per);
}
+}
+
+/*
+ * Return the amount of resource that can be allocated by 'p' before
+ * hitting 'rule'.
+ */
+static int64_t
+rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
+{
+ int64_t available = INT64_MAX;
+ const struct racct *racct;
+
+ ASSERT_RACCT_ENABLED();
+ rw_assert(&rctl_lock, RA_LOCKED);
+ mtx_assert(&racct_lock, MA_OWNED);
+
+ racct = rctl_proc_rule_to_racct(p, rule);
+ available = rule->rr_amount - racct->r_resources[rule->rr_resource];
return (available);
}
/*
- * Return non-zero if allocating 'amount' by proc 'p' would exceed
- * resource limit specified by 'rule'.
+ * Called every second for proc, uidinfo, loginclass, and jail containers.
+ * If the limit wasn't exceeded, it decreases the usage amount to zero.
+ * Otherwise, it decreases it by the value of the limit. This way
+ * resource consumption exceeding the limit "carries over" to the next
+ * period.
*/
-static int
-rctl_would_exceed(const struct proc *p, const struct rctl_rule *rule,
- int64_t amount)
+void
+rctl_throttle_decay(struct racct *racct, int resource)
{
- int64_t available;
+ struct rctl_rule *rule;
+ struct rctl_rule_link *link;
+ int64_t minavailable;
- ASSERT_RACCT_ENABLED();
+ mtx_assert(&racct_lock, MA_OWNED);
- rw_assert(&rctl_lock, RA_LOCKED);
+ minavailable = INT64_MAX;
- available = rctl_available_resource(p, rule);
- if (available >= amount)
- return (0);
+ rw_rlock(&rctl_lock);
- return (1);
+ LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
+ rule = link->rrl_rule;
+
+ if (rule->rr_resource != resource)
+ continue;
+ if (rule->rr_action != RCTL_ACTION_THROTTLE)
+ continue;
+
+ if (rule->rr_amount < minavailable)
+ minavailable = rule->rr_amount;
+ }
+
+ rw_runlock(&rctl_lock);
+
+ if (racct->r_resources[resource] < minavailable) {
+ racct->r_resources[resource] = 0;
+ } else {
+ /*
+ * Cap utilization counter at ten times the limit. Otherwise,
+ * if we accumulated the resource before adding the limiting
+ * rule, it could take unreasonably long time for the counter
+ * value to drop.
+ */
+ if (racct->r_resources[resource] > minavailable * 10)
+ racct->r_resources[resource] = minavailable * 10;
+ else
+ racct->r_resources[resource] -= minavailable;
+ }
}
/*
@@ -328,6 +383,38 @@
return (minavailable);
}
+/*
+ * Add two unsigned 64-bit values, returning UINT64_MAX instead of
+ * a wrapped result when the sum does not fit.
+ */
+static uint64_t
+xadd(uint64_t a, uint64_t b)
+{
+	uint64_t sum;
+
+	sum = a + b;
+
+	/* A sum that wrapped around is smaller than its operands. */
+	return ((sum < a || sum < b) ? UINT64_MAX : sum);
+}
+
+/*
+ * Multiply two unsigned 64-bit values, returning UINT64_MAX instead of
+ * a wrapped result when the product does not fit.
+ */
+static uint64_t
+xmul(uint64_t a, uint64_t b)
+{
+
+	if (a == 0 || b == 0)
+		return (0);
+
+	/*
+	 * Test the precondition instead of inspecting the product: a wrapped
+	 * product can still exceed both operands (0x100000001 squared wraps
+	 * to 0x200000001), so comparing it against 'a' and 'b' misses some
+	 * overflows.  With b != 0, a * b fits iff a <= UINT64_MAX / b.
+	 */
+	if (a > UINT64_MAX / b)
+		return (UINT64_MAX);
+
+	return (a * b);
+}
+
/*
* Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
* to what it keeps allocated now. Returns non-zero if the allocation should
@@ -339,12 +426,16 @@
struct rctl_rule *rule;
struct rctl_rule_link *link;
struct sbuf sb;
+ int64_t available;
+ uint64_t sleep_ms, sleep_ratio;
int should_deny = 0;
char *buf;
- static int curtime = 0;
- static struct timeval lasttime;
+
+ static int log_curtime = 0, devctl_curtime = 0;
+ static struct timeval log_lasttime, devctl_lasttime;
ASSERT_RACCT_ENABLED();
+ mtx_assert(&racct_lock, MA_OWNED);
rw_rlock(&rctl_lock);
@@ -356,7 +447,9 @@
rule = link->rrl_rule;
if (rule->rr_resource != resource)
continue;
- if (!rctl_would_exceed(p, rule, amount)) {
+
+ available = rctl_available_resource(p, rule);
+ if (available >= (int64_t)amount) {
link->rrl_exceeded = 0;
continue;
}
@@ -383,7 +476,8 @@
if (p->p_state != PRS_NORMAL)
continue;
- if (!ppsratecheck(&lasttime, &curtime, 10))
+ if (!ppsratecheck(&log_lasttime, &log_curtime,
+ rctl_log_ratelimit))
continue;
buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
@@ -408,7 +502,11 @@
if (p->p_state != PRS_NORMAL)
continue;
-
+
+ if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
+ rctl_devctl_ratelimit))
+ continue;
+
buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
if (buf == NULL) {
printf("rctl_enforce: out of memory\n");
@@ -427,6 +525,69 @@
free(buf, M_RCTL);
link->rrl_exceeded = 1;
continue;
+ case RCTL_ACTION_THROTTLE:
+ if (p->p_state != PRS_NORMAL)
+ continue;
+
+ /*
+ * Make the process sleep for a fraction of second
+ * proportional to the ratio of process' resource
+ * utilization compared to the limit. The point is
+ * to penalize resource hogs: processes that consume
+ * more of the available resources sleep for longer.
+ *
+ * We're trying to defer division until the very end,
+ * to minimize the rounding effects. The following
+ * calculation could have been written in a clearer
+ * way like this:
+ *
+ * sleep_ms = hz * p->p_racct->r_resources[resource] /
+ * rule->rr_amount;
+ * sleep_ms *= rctl_throttle_pct / 100;
+ * if (sleep_ms < rctl_throttle_min)
+ * sleep_ms = rctl_throttle_min;
+ *
+ */
+ sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
+ sleep_ms = xmul(sleep_ms, rctl_throttle_pct) / 100;
+ if (sleep_ms < rctl_throttle_min * rule->rr_amount)
+ sleep_ms = rctl_throttle_min * rule->rr_amount;
+
+ /*
+ * Multiply that by the ratio of the resource
+ * consumption for the container compared to the limit,
+ * squared. In other words, a process in a container
+ * that is two times over the limit will be throttled
+ * four times as much for hitting the same rule. The
+ * point is to penalize processes more if the container
+ * itself (eg certain UID or jail) is above the limit.
+ */
+ if (available < 0)
+ sleep_ratio = -available / rule->rr_amount;
+ else
+ sleep_ratio = 0;
+ sleep_ratio = xmul(sleep_ratio, sleep_ratio);
+ sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
+ sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
+
+ /*
+ * Finally the division.
+ */
+ sleep_ms /= rule->rr_amount;
+
+ if (sleep_ms > rctl_throttle_max)
+ sleep_ms = rctl_throttle_max;
+#if 0
+ printf("%s: pid %d (%s), %jd of %jd, will sleep for %ld ms (ratio %ld, available %ld)\n",
+ __func__, p->p_pid, p->p_comm,
+ p->p_racct->r_resources[resource],
+ rule->rr_amount, sleep_ms, sleep_ratio, available);
+#endif
+
+ KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ld < %d\n",
+ __func__, sleep_ms, rctl_throttle_min));
+ racct_proc_throttle(p, sleep_ms);
+ continue;
default:
if (link->rrl_exceeded != 0)
continue;
@@ -642,6 +803,9 @@
if ((size_t)(end - str) != strlen(str))
return (EINVAL);
+ if (*value <= 0)
+ return (ERANGE);
+
return (0);
}
@@ -1008,8 +1172,13 @@
error = str2int64(amountstr, &rule->rr_amount);
if (error != 0)
goto out;
- if (RACCT_IS_IN_MILLIONS(rule->rr_resource))
+ if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
+ if (rule->rr_amount > INT64_MAX / 1000000) {
+ error = ERANGE;
+ goto out;
+ }
rule->rr_amount *= 1000000;
+ }
}
if (perstr == NULL || perstr[0] == '\0')
@@ -1048,20 +1217,27 @@
KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
/*
- * Some rules just don't make sense. Note that the one below
- * cannot be rewritten using RACCT_IS_DENIABLE(); the RACCT_PCTCPU,
- * for example, is not deniable in the racct sense, but the
- * limit is enforced in a different way, so "deny" rules for %CPU
- * do make sense.
+ * Some rules just don't make sense, like a "deny" rule for an undeniable
+ * resource. The exceptions are the RSS and %CPU resources - they are
+ * not deniable in the racct sense, but the limit is enforced in
+ * a different way.
*/
if (rule->rr_action == RCTL_ACTION_DENY &&
- (rule->rr_resource == RACCT_CPU ||
- rule->rr_resource == RACCT_WALLCLOCK))
+ !RACCT_IS_DENIABLE(rule->rr_resource) &&
+ rule->rr_resource != RACCT_RSS &&
+ rule->rr_resource != RACCT_PCTCPU) {
return (EOPNOTSUPP);
+ }
+
+ if (rule->rr_action == RCTL_ACTION_THROTTLE &&
+ !RACCT_IS_DECAYING(rule->rr_resource)) {
+ return (EOPNOTSUPP);
+ }
if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
- RACCT_IS_SLOPPY(rule->rr_resource))
+ RACCT_IS_SLOPPY(rule->rr_resource)) {
return (EOPNOTSUPP);
+ }
/*
* Make sure there are no duplicated rules. Also, for the "deny"
@@ -1346,7 +1522,9 @@
for (i = 0; i <= RACCT_MAX; i++) {
if (sloppy == 0 && RACCT_IS_SLOPPY(i))
continue;
+ mtx_lock(&racct_lock);
amount = racct->r_resources[i];
+ mtx_unlock(&racct_lock);
if (RACCT_IS_IN_MILLIONS(i))
amount /= 1000000;
sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
@@ -1619,6 +1797,81 @@
return (error);
}
+/*
+ * Callback for the *_racct_foreach() iterators: for every rule linked
+ * to 'racct' that matches the filter rule passed in 'arg2', bump the
+ * int counter that 'arg3' points to.
+ */
+static void
+rctl_rule_find_callback(struct racct *racct, void *arg2, void *arg3)
+{
+	struct rctl_rule *filter = arg2;
+	struct rctl_rule_link *link;
+	int *foundp = arg3;
+
+	ASSERT_RACCT_ENABLED();
+	rw_assert(&rctl_lock, RA_LOCKED);
+
+	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
+		if (rctl_rule_matches(link->rrl_rule, filter))
+			(*foundp)++;
+	}
+}
+
+/*
+ * Try to set racct_pctcpu_throttle_not_deny to the value given.
+ * It can only be set to '1' if there are no 'pcpu:deny' rules,
+ * and to '0' only if there are no 'pcpu:throttle' rules.
+ *
+ * Returns 0 on success (including when the flag already has the
+ * requested value), or EOPNOTSUPP if a conflicting rule is linked
+ * to any process, login class, user or jail racct.
+ */
+static int
+rctl_set_pctcpu_throttle_not_deny(int val)
+{
+ struct rctl_rule *filter;
+ struct proc *p;
+ int found;
+
+ /* Nothing to do if the flag already has the requested value. */
+ if (racct_pctcpu_throttle_not_deny == val)
+ return (0);
+
+ /*
+ * The purpose of this lock here is not only to be able to iterate
+ * over processes, but also to avoid race with rctl_rule_add()
+ * adding a clashing rule after we do the search, but before we set
+ * the value. Besides, rctl_rule_add() holds this lock anyway.
+ */
+ sx_assert(&allproc_lock, SA_LOCKED);
+
+ /*
+ * Build a filter for the kind of %CPU rule that would clash with the
+ * requested mode: "deny" rules when enabling throttling (val == 1),
+ * "throttle" rules otherwise.
+ */
+ filter = rctl_rule_alloc(M_WAITOK);
+ filter->rr_resource = RACCT_PCTCPU;
+ if (val == 1)
+ filter->rr_action = RCTL_ACTION_DENY;
+ else
+ filter->rr_action = RCTL_ACTION_THROTTLE;
+
+ found = 0;
+
+ /*
+ * Count matching rules linked to every process. The pre/post
+ * callbacks presumably take and drop rctl_lock, which
+ * rctl_rule_find_callback() asserts is held - TODO confirm.
+ */
+ rctl_rule_pre_callback();
+ FOREACH_PROC_IN_SYSTEM(p)
+ rctl_rule_find_callback(p->p_racct, filter, &found);
+ rctl_rule_post_callback();
+
+ /* Same search over the login class, user and prison containers. */
+ loginclass_racct_foreach(rctl_rule_find_callback,
+ rctl_rule_pre_callback, rctl_rule_post_callback,
+ filter, &found);
+ ui_racct_foreach(rctl_rule_find_callback,
+ rctl_rule_pre_callback, rctl_rule_post_callback,
+ filter, &found);
+ prison_racct_foreach(rctl_rule_find_callback,
+ rctl_rule_pre_callback, rctl_rule_post_callback,
+ filter, &found);
+
+ rctl_rule_release(filter);
+
+ /* Refuse to switch modes while a conflicting rule exists. */
+ if (found != 0)
+ return (EOPNOTSUPP);
+
+ racct_pctcpu_throttle_not_deny = val;
+ return (0);
+}
+
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{
@@ -1656,6 +1909,19 @@
goto out;
}
+ if (rule->rr_resource == RACCT_PCTCPU) {
+ if (rule->rr_action == RCTL_ACTION_DENY) {
+ error = rctl_set_pctcpu_throttle_not_deny(0);
+ if (error != 0)
+ goto out;
+ }
+ if (rule->rr_action == RCTL_ACTION_THROTTLE) {
+ error = rctl_set_pctcpu_throttle_not_deny(1);
+ if (error != 0)
+ goto out;
+ }
+ }
+
error = rctl_rule_add(rule);
out:
@@ -1935,6 +2201,15 @@
UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+
+ if (rctl_throttle_min <= 0)
+ rctl_throttle_min = 1;
+ if (rctl_throttle_max <= 0)
+ rctl_throttle_max = 2 * hz;
+ if (rctl_throttle_pct <= 0)
+ rctl_throttle_pct = 100;
+ if (rctl_throttle_pct2 <= 0)
+ rctl_throttle_pct2 = 100;
}
#else /* !RCTL */
Index: sys/kern/subr_trap.c
===================================================================
--- sys/kern/subr_trap.c
+++ sys/kern/subr_trap.c
@@ -172,10 +172,14 @@
(td->td_vnet_lpush != NULL) ? td->td_vnet_lpush : "N/A"));
#endif
#ifdef RACCT
- if (racct_enable && p->p_throttled == 1) {
+ if (racct_enable && p->p_throttled != 0) {
PROC_LOCK(p);
- while (p->p_throttled == 1)
- msleep(p->p_racct, &p->p_mtx, 0, "racct", 0);
+ while (p->p_throttled != 0) {
+ msleep(p->p_racct, &p->p_mtx, 0, "racct",
+ p->p_throttled < 0 ? 0 : p->p_throttled);
+ if (p->p_throttled > 0)
+ p->p_throttled = 0;
+ }
PROC_UNLOCK(p);
}
#endif
Index: sys/kern/vfs_bio.c
===================================================================
--- sys/kern/vfs_bio.c
+++ sys/kern/vfs_bio.c
@@ -61,6 +61,7 @@
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/smp.h>
@@ -1784,8 +1785,16 @@
rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
if ((rabp->b_flags & B_CACHE) == 0) {
- if (!TD_IS_IDLETHREAD(curthread))
+ if (!TD_IS_IDLETHREAD(curthread)) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, rabp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
+ }
rabp->b_flags |= B_ASYNC;
rabp->b_flags &= ~B_INVAL;
rabp->b_ioflags &= ~BIO_ERROR;
@@ -1827,8 +1836,16 @@
/* if not found in cache, do some I/O */
if ((bp->b_flags & B_CACHE) == 0) {
- if (!TD_IS_IDLETHREAD(curthread))
+ if (!TD_IS_IDLETHREAD(curthread)) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
+ }
bp->b_iocmd = BIO_READ;
bp->b_flags &= ~B_INVAL;
bp->b_ioflags &= ~BIO_ERROR;
@@ -1920,8 +1937,16 @@
bp->b_runningbufspace = bp->b_bufsize;
space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
- if (!TD_IS_IDLETHREAD(curthread))
+ if (!TD_IS_IDLETHREAD(curthread)) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 1);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_oublock++;
+ }
if (oldflags & B_ASYNC)
BUF_KERNPROC(bp);
bp->b_iooffset = dbtob(bp->b_blkno);
Index: sys/kern/vfs_cluster.c
===================================================================
--- sys/kern/vfs_cluster.c
+++ sys/kern/vfs_cluster.c
@@ -45,6 +45,7 @@
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/vmmeter.h>
@@ -239,6 +240,13 @@
BUF_KERNPROC(bp);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
}
@@ -292,6 +300,13 @@
BUF_KERNPROC(rbp);
rbp->b_iooffset = dbtob(rbp->b_blkno);
bstrategy(rbp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, rbp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
}
Index: sys/sys/proc.h
===================================================================
--- sys/sys/proc.h
+++ sys/sys/proc.h
@@ -620,7 +620,7 @@
after fork. */
uint64_t p_prev_runtime; /* (c) Resource usage accounting. */
struct racct *p_racct; /* (b) Resource accounting. */
- u_char p_throttled; /* (c) Flag for racct pcpu throttling */
+ int p_throttled; /* (c) Flag for racct pcpu throttling */
struct vm_domain_policy p_vm_dom_policy; /* (c) process default VM domain, or -1 */
/*
* An orphan is the child that has beed re-parented to the
Index: sys/sys/racct.h
===================================================================
--- sys/sys/racct.h
+++ sys/sys/racct.h
@@ -42,6 +42,7 @@
#include <sys/stdint.h>
#include <sys/sysctl.h>
+struct buf;
struct proc;
struct rctl_rule_link;
struct ucred;
@@ -71,7 +72,11 @@
#define RACCT_SHMSIZE 18
#define RACCT_WALLCLOCK 19
#define RACCT_PCTCPU 20
-#define RACCT_MAX RACCT_PCTCPU
+#define RACCT_READBPS 21
+#define RACCT_WRITEBPS 22
+#define RACCT_READIOPS 23
+#define RACCT_WRITEIOPS 24
+#define RACCT_MAX RACCT_WRITEIOPS
/*
* Resource properties.
@@ -85,6 +90,7 @@
extern int racct_types[];
extern int racct_enable;
+extern int racct_pctcpu_throttle_not_deny;
#define ASSERT_RACCT_ENABLED() KASSERT(racct_enable, \
("%s called with !racct_enable", __func__))
@@ -98,7 +104,7 @@
/*
* Resource usage can drop, as opposed to only grow. When the process
- * terminates, its resource usage is freed from the respective
+ * terminates, its resource usage is subtracted from the respective
* per-credential racct containers.
*/
#define RACCT_IS_RECLAIMABLE(X) (racct_types[X] & RACCT_RECLAIMABLE)
@@ -126,8 +132,7 @@
* When a process terminates, its resource usage is not automatically
* subtracted from per-credential racct containers. Instead, the resource
* usage of per-credential racct containers decays in time.
- * Resource usage can olso drop for such resource.
- * So far, the only such resource is RACCT_PCTCPU.
+ * Resource usage can also drop for such resource.
*/
#define RACCT_IS_DECAYING(X) (racct_types[X] & RACCT_DECAYING)
@@ -149,11 +154,14 @@
SYSCTL_DECL(_kern_racct);
+extern struct mtx racct_lock;
+
#ifdef RACCT
int racct_add(struct proc *p, int resource, uint64_t amount);
void racct_add_cred(struct ucred *cred, int resource, uint64_t amount);
void racct_add_force(struct proc *p, int resource, uint64_t amount);
+void racct_add_buf(struct proc *p, const struct buf *bufp, int is_write);
int racct_set(struct proc *p, int resource, uint64_t amount);
void racct_set_force(struct proc *p, int resource, uint64_t amount);
void racct_sub(struct proc *p, int resource, uint64_t amount);
@@ -171,6 +179,7 @@
void racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
struct ucred *newcred);
void racct_move(struct racct *dest, struct racct *src);
+void racct_proc_throttle(struct proc *p, int timeout);
#else
Index: sys/sys/rctl.h
===================================================================
--- sys/sys/rctl.h
+++ sys/sys/rctl.h
@@ -129,7 +129,8 @@
#define RCTL_ACTION_DENY (RCTL_ACTION_SIGNAL_MAX + 1)
#define RCTL_ACTION_LOG (RCTL_ACTION_SIGNAL_MAX + 2)
#define RCTL_ACTION_DEVCTL (RCTL_ACTION_SIGNAL_MAX + 3)
-#define RCTL_ACTION_MAX RCTL_ACTION_DEVCTL
+#define RCTL_ACTION_THROTTLE (RCTL_ACTION_SIGNAL_MAX + 4)
+#define RCTL_ACTION_MAX RCTL_ACTION_THROTTLE
#define RCTL_AMOUNT_UNDEFINED -1
@@ -140,6 +141,7 @@
int rctl_rule_add(struct rctl_rule *rule);
int rctl_rule_remove(struct rctl_rule *filter);
int rctl_enforce(struct proc *p, int resource, uint64_t amount);
+void rctl_throttle_decay(struct racct *racct, int resource);
int64_t rctl_pcpu_available(const struct proc *p);
uint64_t rctl_get_limit(struct proc *p, int resource);
uint64_t rctl_get_available(struct proc *p, int resource);
Index: sys/ufs/ffs/ffs_inode.c
===================================================================
--- sys/ufs/ffs/ffs_inode.c
+++ sys/ufs/ffs/ffs_inode.c
@@ -41,6 +41,7 @@
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/random.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
@@ -618,6 +619,13 @@
vp = ITOV(ip);
bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0, 0);
if ((bp->b_flags & B_CACHE) == 0) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++; /* pay for read */
bp->b_iocmd = BIO_READ;
bp->b_flags &= ~B_INVAL;
Index: sys/ufs/ffs/ffs_softdep.c
===================================================================
--- sys/ufs/ffs/ffs_softdep.c
+++ sys/ufs/ffs/ffs_softdep.c
@@ -69,6 +69,7 @@
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
@@ -6229,6 +6230,13 @@
vfs_busy_pages(bp, 0);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
error = bufwait(bp);
if (error) {
Index: sys/ufs/ufs/ufs_bmap.c
===================================================================
--- sys/ufs/ufs/ufs_bmap.c
+++ sys/ufs/ufs/ufs_bmap.c
@@ -44,6 +44,7 @@
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
@@ -223,6 +224,13 @@
vfs_busy_pages(bp, 0);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
curthread->td_ru.ru_inblock++;
error = bufwait(bp);
if (error) {
Index: sys/vm/vm_fault.c
===================================================================
--- sys/vm/vm_fault.c
+++ sys/vm/vm_fault.c
@@ -83,6 +83,7 @@
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/proc.h>
+#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
@@ -994,6 +995,21 @@
if (hardfault) {
PCPU_INC(cnt.v_io_faults);
curthread->td_ru.ru_majflt++;
+#ifdef RACCT
+ if (racct_enable && fs.object->type == OBJT_VNODE) {
+ PROC_LOCK(curproc);
+ if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
+ racct_add_force(curproc, RACCT_WRITEBPS,
+ PAGE_SIZE + behind * PAGE_SIZE);
+ racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+ } else {
+ racct_add_force(curproc, RACCT_READBPS,
+ PAGE_SIZE + ahead * PAGE_SIZE);
+ racct_add_force(curproc, RACCT_READIOPS, 1);
+ }
+ PROC_UNLOCK(curproc);
+ }
+#endif
} else
curthread->td_ru.ru_minflt++;
Index: usr.bin/rctl/rctl.8
===================================================================
--- usr.bin/rctl/rctl.8
+++ usr.bin/rctl/rctl.8
@@ -204,14 +204,22 @@
.It Sy shmsize Ta "SysV shared memory size, in bytes"
.It Sy wallclock Ta "wallclock time, in seconds"
.It Sy pcpu Ta "%CPU, in percents of a single CPU core"
+.It Sy readbps Ta "filesystem reads, in bytes per second"
+.It Sy writebps Ta "filesystem writes, in bytes per second"
+.It Sy readiops Ta "filesystem reads, in operations per second"
+.It Sy writeiops Ta "filesystem writes, in operations per second"
.El
.Sh ACTIONS
.Bl -column -offset 3n "pseudoterminals"
.It Em action
.It Sy deny Ta deny the allocation; not supported for
-.Sy cputime
+.Sy cputime ,
+.Sy wallclock ,
+.Sy readbps ,
+.Sy writebps ,
+.Sy readiops ,
and
-.Sy wallclock
+.Sy writeiops
.It Sy log Ta "log a warning to the console"
.It Sy devctl Ta "send notification to"
.Xr devd 8
@@ -228,6 +236,19 @@
See
.Xr signal 3
for a list of supported signals
+.It Sy throttle Ta "slow down process execution"; only supported for
+.Sy pcpu , readbps ,
+.Sy writebps ,
+.Sy readiops ,
+and
+.Sy writeiops .
+Note that one cannot have both
+.Sy throttle
+and
+.Sy deny
+rules for the
+.Sy pcpu
+resource at the same time.
.El
.Pp
Not all actions are supported for all resources.
@@ -287,3 +308,21 @@
Limiting
.Sy memoryuse
may kill the machine due to thrashing.
+.Pp
+The
+.Sy readiops
+and
+.Sy writeiops
+counters are only approximations; just like
+.Sy readbps
+and
+.Sy writebps ,
+they are accounted for in the filesystem layer, where it's hard
+or even impossible to observe actual disk device operations.
+.Pp
+The
+.Sy writebps
+and
+.Sy writeiops
+resources generally account for writes to the filesystem cache,
+not to actual devices.
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Apr 22, 2:49 AM (15 h, 15 s)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
31948526
Default Alt Text
D5080.id12715.diff (43 KB)
Attached To
Mode
D5080: RCTL disk io limits.
Attached
Detach File
Event Timeline
Log In to Comment