Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -132,6 +132,7 @@ #include #ifdef _KERNEL #include +#include #endif #include #include @@ -4503,6 +4504,18 @@ demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); #ifdef _KERNEL +#ifdef RACCT + if (racct_enable) { +#if 0 + printf("%s: adding %jd bytes for %d (%s)\n", + __func__, (uintmax_t)size, curproc->p_pid, curproc->p_comm); +#endif + PROC_LOCK(curproc); + racct_add_force(curproc, RACCT_READBPS, size); + racct_add_force(curproc, RACCT_READIOPS, 1); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ curthread->td_ru.ru_inblock++; #endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -47,6 +47,7 @@ #include #include #ifdef _KERNEL +#include #include #include #endif @@ -427,6 +428,15 @@ } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); +#if defined(_KERNEL) && defined(RACCT) + if (racct_enable && !read) { + PROC_LOCK(curproc); + racct_add_force(curproc, RACCT_WRITEBPS, length); + racct_add_force(curproc, RACCT_WRITEIOPS, nblks); + PROC_UNLOCK(curproc); + } +#endif + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); for (i = 0; i < nblks; i++) { @@ -1422,7 +1432,15 @@ DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) { #ifdef _KERNEL curthread->td_ru.ru_oublock++; -#endif +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_force(curproc, RACCT_WRITEBPS, blksz); + racct_add_force(curproc, RACCT_WRITEIOPS, 1); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ +#endif /* _KERNEL */ dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { 
Index: sys/fs/ext2fs/ext2_bmap.c =================================================================== --- sys/fs/ext2fs/ext2_bmap.c +++ sys/fs/ext2fs/ext2_bmap.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -247,6 +248,13 @@ vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, bp, 0); + PROC_UNLOCK(curproc); + } +#endif curthread->td_ru.ru_inblock++; error = bufwait(bp); if (error) { Index: sys/kern/kern_physio.c =================================================================== --- sys/kern/kern_physio.c +++ sys/kern/kern_physio.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -109,6 +110,22 @@ prot |= VM_PROT_WRITE; /* Less backwards than it looks */ error = 0; for (i = 0; i < uio->uio_iovcnt; i++) { +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + if (uio->uio_rw == UIO_READ) { + racct_add_force(curproc, RACCT_READBPS, + uio->uio_iov[i].iov_len); + racct_add_force(curproc, RACCT_READIOPS, 1); + } else { + racct_add_force(curproc, RACCT_WRITEBPS, + uio->uio_iov[i].iov_len); + racct_add_force(curproc, RACCT_WRITEIOPS, 1); + } + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ + while (uio->uio_iov[i].iov_len) { g_reset_bio(bp); if (uio->uio_rw == UIO_READ) { Index: sys/kern/kern_racct.c =================================================================== --- sys/kern/kern_racct.c +++ sys/kern/kern_racct.c @@ -35,6 +35,7 @@ #include "opt_sched.h" #include +#include #include #include #include @@ -66,21 +67,25 @@ FEATURE(racct, "Resource Accounting"); -/* - * Do not block processes that have their %cpu usage <= pcpu_threshold. - */ -static int pcpu_threshold = 1; #ifdef RACCT_DEFAULT_TO_DISABLED int racct_enable = 0; #else int racct_enable = 1; #endif +/* + * Do not block processes that have their %cpu usage <= pcpu_threshold. 
+ */ +static int pcpu_threshold = 1; +int racct_pctcpu_throttle_not_deny = 0; SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting"); SYSCTL_UINT(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable, 0, "Enable RACCT/RCTL"); SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold, - 0, "Processes with higher %cpu usage than this value can be throttled."); + 0, "\"pcpu:deny\" rules don't affect processes with %cpu below this value"); +SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_throttle_not_deny, CTLFLAG_RD, + &racct_pctcpu_throttle_not_deny, + 0, "Use \"pcpu:throttle\" rules instead of \"pcpu:deny\""); /* * How many seconds it takes to use the scheduler %cpu calculations. When a @@ -90,7 +95,7 @@ */ #define RACCT_PCPU_SECS 3 -static struct mtx racct_lock; +struct mtx racct_lock; MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF); static uma_zone_t racct_zone; @@ -171,7 +176,16 @@ [RACCT_WALLCLOCK] = RACCT_IN_MILLIONS, [RACCT_PCTCPU] = - RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS }; + RACCT_RECLAIMABLE | RACCT_DECAYING | RACCT_DENIABLE | + RACCT_IN_MILLIONS, + [RACCT_READBPS] = + RACCT_DECAYING, + [RACCT_WRITEBPS] = + RACCT_DECAYING, + [RACCT_READIOPS] = + RACCT_DECAYING, + [RACCT_WRITEIOPS] = + RACCT_DECAYING }; static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE; @@ -473,6 +487,8 @@ for (i = 0; i <= RACCT_MAX; i++) { if (RACCT_IS_SLOPPY(i)) continue; + if (RACCT_IS_DECAYING(i)) + continue; if (!RACCT_IS_RECLAIMABLE(i)) continue; KASSERT(racct->r_resources[i] == 0, @@ -498,8 +514,8 @@ /* * Increase consumption of 'resource' by 'amount' for 'racct', - * but not its parents. Differently from other cases, 'amount' here - * may be less than zero. + * but not its parents. Differently from other cases, 'amount' + * here may be less than zero. */ static void racct_adjust_resource(struct racct *racct, int resource, @@ -526,9 +542,10 @@ * returns for a thread more than 100% cpu usage. 
So we set a sane * boundary here to 100% * the maxumum number of CPUs. */ - if ((resource == RACCT_PCTCPU) && - (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU)) + if (!racct_pctcpu_throttle_not_deny && resource == RACCT_PCTCPU && + racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU) { racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU; + } } static int @@ -612,16 +629,11 @@ mtx_unlock(&racct_lock); } -/* - * Increase allocation of 'resource' by 'amount' for process 'p'. - * Doesn't check for limits and never fails. - */ -void -racct_add_force(struct proc *p, int resource, uint64_t amount) +static void +racct_add_force_locked(struct proc *p, int resource, uint64_t amount) { - if (!racct_enable) - return; + ASSERT_RACCT_ENABLED(); SDT_PROBE3(racct, , rusage, add__force, p, resource, amount); @@ -630,12 +642,48 @@ */ PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_lock(&racct_lock); + /* + * Ignore the return value - we can't return error, but RCTL might + * eg. throttle the process. + */ + (void)rctl_enforce(p, resource, amount); + racct_adjust_resource(p->p_racct, resource, amount); racct_add_cred_locked(p->p_ucred, resource, amount); +} + +/* + * Increase allocation of 'resource' by 'amount' for process 'p'. + * Checks for limits, but never fails. 
+ */ +void +racct_add_force(struct proc *p, int resource, uint64_t amount) +{ + + if (!racct_enable) + return; + + mtx_lock(&racct_lock); + racct_add_force_locked(p, resource, amount); mtx_unlock(&racct_lock); } +void +racct_add_buf(struct proc *p, const struct buf *bp, int is_write) +{ + + ASSERT_RACCT_ENABLED(); + PROC_LOCK_ASSERT(p, MA_OWNED); + + if (is_write) { + racct_add_force(p, RACCT_WRITEBPS, bp->b_bcount); + racct_add_force(p, RACCT_WRITEIOPS, 1); + } else { + racct_add_force(p, RACCT_READBPS, bp->b_bcount); + racct_add_force(p, RACCT_READIOPS, 1); + } +} + static int racct_set_locked(struct proc *p, int resource, uint64_t amount) { @@ -659,7 +707,7 @@ * The diffs may be negative. */ diff_proc = amount - old_amount; - if (RACCT_IS_DECAYING(resource)) { + if (!racct_pctcpu_throttle_not_deny && RACCT_IS_DECAYING(resource)) { /* * Resources in per-credential racct containers may decay. * If this is the case, we need to calculate the difference @@ -688,7 +736,7 @@ racct_adjust_resource(p->p_racct, resource, diff_proc); if (diff_cred > 0) racct_add_cred_locked(p->p_ucred, resource, diff_cred); - else if (diff_cred < 0) + else if (RACCT_IS_RECLAIMABLE(resource) && diff_cred < 0) racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); return (0); @@ -735,7 +783,7 @@ * The diffs may be negative. */ diff_proc = amount - old_amount; - if (RACCT_IS_DECAYING(resource)) { + if (!racct_pctcpu_throttle_not_deny && RACCT_IS_DECAYING(resource)) { /* * Resources in per-credential racct containers may decay.
* If this is the case, we need to calculate the difference @@ -750,7 +798,7 @@ racct_adjust_resource(p->p_racct, resource, diff_proc); if (diff_cred > 0) racct_add_cred_locked(p->p_ucred, resource, diff_cred); - else if (diff_cred < 0) + else if (RACCT_IS_RECLAIMABLE(resource) && diff_cred < 0) racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); } @@ -850,7 +898,8 @@ (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid)); racct_adjust_resource(p->p_racct, resource, -amount); - racct_sub_cred_locked(p->p_ucred, resource, amount); + if (RACCT_IS_RECLAIMABLE(resource)) + racct_sub_cred_locked(p->p_ucred, resource, amount); mtx_unlock(&racct_lock); } @@ -993,13 +1042,17 @@ #endif microuptime(&wallclock); timevalsub(&wallclock, &p->p_stats->p_start); - if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { - pct_estimate = (1000000 * runtime * 100) / - ((uint64_t)wallclock.tv_sec * 1000000 + - wallclock.tv_usec); - } else - pct_estimate = 0; - pct = racct_getpcpu(p, pct_estimate); + if (!racct_pctcpu_throttle_not_deny) { + if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { + pct_estimate = (1000000 * runtime * 100) / + ((uint64_t)wallclock.tv_sec * 1000000 + + wallclock.tv_usec); + } else + pct_estimate = 0; + pct = racct_getpcpu(p, pct_estimate); + } else { + pct = (runtime - p->p_prev_runtime) * 100; + } mtx_lock(&racct_lock); racct_set_locked(p, RACCT_CPU, runtime); @@ -1084,14 +1137,15 @@ mtx_unlock(&racct_lock); } -static void -racct_proc_throttle(struct proc *p) +void +racct_proc_throttle(struct proc *p, int timeout) { struct thread *td; #ifdef SMP int cpuid; #endif + KASSERT(timeout != 0, ("timeout %d", timeout)); ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); @@ -1099,10 +1153,12 @@ * Do not block kernel processes. Also do not block processes with * low %cpu utilization to improve interactivity. 
*/ - if (((p->p_flag & (P_SYSTEM | P_KPROC)) != 0) || - (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold)) + if ((p->p_flag & (P_SYSTEM | P_KPROC)) != 0) return; - p->p_throttled = 1; + if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout)) + return; + + p->p_throttled = timeout; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); @@ -1143,30 +1199,37 @@ PROC_LOCK_ASSERT(p, MA_OWNED); - if (p->p_throttled) { + if (p->p_throttled != 0) { p->p_throttled = 0; wakeup(p->p_racct); } } static void -racct_decay_resource(struct racct *racct, void * res, void* dummy) +racct_decay_callback(struct racct *racct, void *res, void *dummy) { - int resource; int64_t r_old, r_new; ASSERT_RACCT_ENABLED(); mtx_assert(&racct_lock, MA_OWNED); - resource = *(int *)res; - r_old = racct->r_resources[resource]; + rctl_throttle_decay(racct, RACCT_READBPS); + rctl_throttle_decay(racct, RACCT_WRITEBPS); + rctl_throttle_decay(racct, RACCT_READIOPS); + rctl_throttle_decay(racct, RACCT_WRITEIOPS); + + if (racct_pctcpu_throttle_not_deny) { + rctl_throttle_decay(racct, RACCT_PCTCPU); + } else { + r_old = racct->r_resources[RACCT_PCTCPU]; + + /* If there is nothing to decay, just exit. */ + if (r_old <= 0) + return; - /* If there is nothing to decay, just exit. 
*/ - if (r_old <= 0) - return; - - r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; - racct->r_resources[resource] = r_new; + r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; + racct->r_resources[RACCT_PCTCPU] = r_new; + } } static void @@ -1184,17 +1247,17 @@ } static void -racct_decay(int resource) +racct_decay(void) { ASSERT_RACCT_ENABLED(); - ui_racct_foreach(racct_decay_resource, racct_decay_pre, - racct_decay_post, &resource, NULL); - loginclass_racct_foreach(racct_decay_resource, racct_decay_pre, - racct_decay_post, &resource, NULL); - prison_racct_foreach(racct_decay_resource, racct_decay_pre, - racct_decay_post, &resource, NULL); + ui_racct_foreach(racct_decay_callback, racct_decay_pre, + racct_decay_post, NULL, NULL); + loginclass_racct_foreach(racct_decay_callback, racct_decay_pre, + racct_decay_post, NULL, NULL); + prison_racct_foreach(racct_decay_callback, racct_decay_pre, + racct_decay_post, NULL, NULL); } static void @@ -1205,18 +1268,22 @@ struct timeval wallclock; uint64_t runtime; uint64_t pct, pct_estimate; + bool throttle; + //int error; ASSERT_RACCT_ENABLED(); for (;;) { - racct_decay(RACCT_PCTCPU); + racct_decay(); sx_slock(&allproc_lock); - LIST_FOREACH(p, &zombproc, p_list) { - PROC_LOCK(p); - racct_set(p, RACCT_PCTCPU, 0); - PROC_UNLOCK(p); + if (!racct_pctcpu_throttle_not_deny) { + LIST_FOREACH(p, &zombproc, p_list) { + PROC_LOCK(p); + racct_set(p, RACCT_PCTCPU, 0); + PROC_UNLOCK(p); + } } FOREACH_PROC_IN_SYSTEM(p) { @@ -1240,24 +1307,63 @@ if (runtime < p->p_prev_runtime) runtime = p->p_prev_runtime; #endif - p->p_prev_runtime = runtime; - if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { - pct_estimate = (1000000 * runtime * 100) / - ((uint64_t)wallclock.tv_sec * 1000000 + - wallclock.tv_usec); - } else - pct_estimate = 0; - pct = racct_getpcpu(p, pct_estimate); + if (!racct_pctcpu_throttle_not_deny) { + if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { + pct_estimate = (1000000 * runtime * 100) / + ((uint64_t)wallclock.tv_sec * 1000000 + 
+ wallclock.tv_usec); + } else + pct_estimate = 0; + pct = racct_getpcpu(p, pct_estimate); + } else { + pct = (runtime - p->p_prev_runtime) * 100; + } mtx_lock(&racct_lock); - racct_set_force_locked(p, RACCT_PCTCPU, pct); + + rctl_throttle_decay(p->p_racct, RACCT_READBPS); + rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS); + rctl_throttle_decay(p->p_racct, RACCT_READIOPS); + rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS); + if (racct_pctcpu_throttle_not_deny) { + rctl_throttle_decay(p->p_racct, RACCT_PCTCPU); + /* + * Preferably we would bump the counters + * from statclock. We cannot, however - to call + * racct_add() we need the proc lock, and we + * cannot acquire that in statclock(). + * + * Note that 1% basically means 10 milliseconds. + */ + racct_add_force_locked(p, RACCT_PCTCPU, pct); + } else { + racct_set_force_locked(p, RACCT_PCTCPU, pct); + } + p->p_prev_runtime = runtime; + +#if 0 + if (racct_pctcpu_throttle_not_deny) { + /* + * Ignore the return value; there cannot + * be a "deny" rule there. 
+ */ + error = rctl_enforce(p, RACCT_PCTCPU, 0); + KASSERT(error == 0, + ("rctl_enforce() returned %d", error)); + } +#endif + racct_set_locked(p, RACCT_CPU, runtime); racct_set_locked(p, RACCT_WALLCLOCK, (uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec); + mtx_unlock(&racct_lock); PROC_UNLOCK(p); } + if (racct_pctcpu_throttle_not_deny) + goto done; + /* * To ensure that processes are throttled in a fair way, we need * to iterate over all processes again and check the limits @@ -1271,12 +1377,18 @@ continue; } - if (racct_pcpu_available(p) <= 0) - racct_proc_throttle(p); - else if (p->p_throttled) + throttle = false; + if (racct_pcpu_available(p) <= 0 && + (p->p_racct->r_resources[RACCT_PCTCPU] > pcpu_threshold)) + throttle = true; + + if (throttle) + racct_proc_throttle(p, -1); + else if (p->p_throttled == -1) racct_proc_wakeup(p); PROC_UNLOCK(p); } +done: sx_sunlock(&allproc_lock); pause("-", hz); } Index: sys/kern/kern_rctl.c =================================================================== --- sys/kern/kern_rctl.c +++ sys/kern/kern_rctl.c @@ -77,11 +77,31 @@ #define RCTL_PCPU_SHIFT (10 * 1000000) -unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE; +static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE; +static int rctl_log_ratelimit = 10; +static int rctl_devctl_ratelimit = 10; +static unsigned int rctl_throttle_min = 0; +static unsigned int rctl_throttle_max = 0; +static unsigned int rctl_throttle_pct = 0; +static unsigned int rctl_throttle_pct2 = 0; SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN, &rctl_maxbufsize, 0, "Maximum output buffer size"); +SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, rctl_log_ratelimit, CTLFLAG_RW, + &rctl_log_ratelimit, 0, "Maximum number of log messages per second"); +SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, rctl_devctl_ratelimit, CTLFLAG_RW, + &rctl_devctl_ratelimit, 0, "Maximum number of devctl messages per second"); 
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_min, CTLFLAG_RDTUN, + &rctl_throttle_min, 0, "Shortest throttling duration, in hz"); +SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_max, CTLFLAG_RDTUN, + &rctl_throttle_max, 0, "Longest throttling duration, in hz"); +SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_pct, CTLFLAG_RDTUN, + &rctl_throttle_pct, 0, + "Throttling penalty for process consumption, in percent"); +SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_pct2, CTLFLAG_RDTUN, + &rctl_throttle_pct2, 0, + "Throttling penalty for container consumption, in percent"); /* * 'rctl_rule_link' connects a rule with every racct it's related to. @@ -128,6 +148,10 @@ { "shmsize", RACCT_SHMSIZE }, { "wallclock", RACCT_WALLCLOCK }, { "pcpu", RACCT_PCTCPU }, + { "readbps", RACCT_READBPS }, + { "writebps", RACCT_WRITEBPS }, + { "readiops", RACCT_READIOPS }, + { "writeiops", RACCT_WRITEIOPS }, { NULL, -1 }}; static struct dict actionnames[] = { @@ -165,6 +189,7 @@ { "deny", RCTL_ACTION_DENY }, { "log", RCTL_ACTION_LOG }, { "devctl", RCTL_ACTION_DEVCTL }, + { "throttle", RCTL_ACTION_THROTTLE }, { NULL, -1 }}; static void rctl_init(void); @@ -219,66 +244,96 @@ panic("rctl_resource_name: unknown resource %d", resource); } -/* - * Return the amount of resource that can be allocated by 'p' before - * hitting 'rule'. 
- */ -static int64_t -rctl_available_resource(const struct proc *p, const struct rctl_rule *rule) +static struct racct * +rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule) { - int resource; - int64_t available = INT64_MAX; struct ucred *cred = p->p_ucred; ASSERT_RACCT_ENABLED(); rw_assert(&rctl_lock, RA_LOCKED); - resource = rule->rr_resource; switch (rule->rr_per) { case RCTL_SUBJECT_TYPE_PROCESS: - available = rule->rr_amount - - p->p_racct->r_resources[resource]; - break; + return (p->p_racct); case RCTL_SUBJECT_TYPE_USER: - available = rule->rr_amount - - cred->cr_ruidinfo->ui_racct->r_resources[resource]; - break; + return (cred->cr_ruidinfo->ui_racct); case RCTL_SUBJECT_TYPE_LOGINCLASS: - available = rule->rr_amount - - cred->cr_loginclass->lc_racct->r_resources[resource]; - break; + return (cred->cr_loginclass->lc_racct); case RCTL_SUBJECT_TYPE_JAIL: - available = rule->rr_amount - - cred->cr_prison->pr_prison_racct->prr_racct-> - r_resources[resource]; - break; + return (cred->cr_prison->pr_prison_racct->prr_racct); default: - panic("rctl_compute_available: unknown per %d", - rule->rr_per); + panic("%s: unknown per %d", __func__, rule->rr_per); } +} + +/* + * Return the amount of resource that can be allocated by 'p' before + * hitting 'rule'. + */ +static int64_t +rctl_available_resource(const struct proc *p, const struct rctl_rule *rule) +{ + int64_t available = INT64_MAX; + const struct racct *racct; + + ASSERT_RACCT_ENABLED(); + rw_assert(&rctl_lock, RA_LOCKED); + mtx_assert(&racct_lock, MA_OWNED); + + racct = rctl_proc_rule_to_racct(p, rule); + available = rule->rr_amount - racct->r_resources[rule->rr_resource]; return (available); } /* - * Return non-zero if allocating 'amount' by proc 'p' would exceed - * resource limit specified by 'rule'. + * Called every second for proc, uidinfo, loginclass, and jail containers. + * If the limit wasn't exceeded, it decreases the usage amount to zero. 
+ * Otherwise, it decreases it by the value of the limit. This way + * resource consumption exceeding the limit "carries over" to the next + * period. */ -static int -rctl_would_exceed(const struct proc *p, const struct rctl_rule *rule, - int64_t amount) +void +rctl_throttle_decay(struct racct *racct, int resource) { - int64_t available; + struct rctl_rule *rule; + struct rctl_rule_link *link; + int64_t minavailable; - ASSERT_RACCT_ENABLED(); + mtx_assert(&racct_lock, MA_OWNED); - rw_assert(&rctl_lock, RA_LOCKED); + minavailable = INT64_MAX; - available = rctl_available_resource(p, rule); - if (available >= amount) - return (0); + rw_rlock(&rctl_lock); - return (1); + LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { + rule = link->rrl_rule; + + if (rule->rr_resource != resource) + continue; + if (rule->rr_action != RCTL_ACTION_THROTTLE) + continue; + + if (rule->rr_amount < minavailable) + minavailable = rule->rr_amount; + } + + rw_runlock(&rctl_lock); + + if (racct->r_resources[resource] < minavailable) { + racct->r_resources[resource] = 0; + } else { + /* + * Cap utilization counter at ten times the limit. Otherwise, + * if we accumulated the resource before adding the limiting + * rule, it could take unreasonably long time for the counter + * value to drop. + */ + if (racct->r_resources[resource] > minavailable * 10) + racct->r_resources[resource] = minavailable * 10; + else + racct->r_resources[resource] -= minavailable; + } } /* @@ -328,6 +383,38 @@ return (minavailable); } +static uint64_t +xadd(uint64_t a, uint64_t b) +{ + uint64_t c; + + c = a + b; + + /* + * Detect overflow. + */ + if (c < a || c < b) + return (UINT64_MAX); + + return (c); +} + +static uint64_t +xmul(uint64_t a, uint64_t b) +{ + + if (a == 0 || b == 0) + return (0); + + /* + * Saturate at UINT64_MAX; a * b overflows iff a > UINT64_MAX / b. + */ + if (a > UINT64_MAX / b) + return (UINT64_MAX); + + return (a * b); +} + /* * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition * to what it keeps allocated now.
Returns non-zero if the allocation should @@ -339,12 +426,16 @@ struct rctl_rule *rule; struct rctl_rule_link *link; struct sbuf sb; + int64_t available; + uint64_t sleep_ms, sleep_ratio; int should_deny = 0; char *buf; - static int curtime = 0; - static struct timeval lasttime; + + static int log_curtime = 0, devctl_curtime = 0; + static struct timeval log_lasttime, devctl_lasttime; ASSERT_RACCT_ENABLED(); + mtx_assert(&racct_lock, MA_OWNED); rw_rlock(&rctl_lock); @@ -356,7 +447,9 @@ rule = link->rrl_rule; if (rule->rr_resource != resource) continue; - if (!rctl_would_exceed(p, rule, amount)) { + + available = rctl_available_resource(p, rule); + if (available >= (int64_t)amount) { link->rrl_exceeded = 0; continue; } @@ -383,7 +476,8 @@ if (p->p_state != PRS_NORMAL) continue; - if (!ppsratecheck(&lasttime, &curtime, 10)) + if (!ppsratecheck(&log_lasttime, &log_curtime, + rctl_log_ratelimit)) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); @@ -408,7 +502,11 @@ if (p->p_state != PRS_NORMAL) continue; - + + if (!ppsratecheck(&devctl_lasttime, &devctl_curtime, + rctl_devctl_ratelimit)) + continue; + buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); @@ -427,6 +525,69 @@ free(buf, M_RCTL); link->rrl_exceeded = 1; continue; + case RCTL_ACTION_THROTTLE: + if (p->p_state != PRS_NORMAL) + continue; + + /* + * Make the process sleep for a fraction of second + * proportional to the ratio of process' resource + * utilization compared to the limit. The point is + * to penalize resource hogs: processes that consume + * more of the available resources sleep for longer. + * + * We're trying to defer division until the very end, + * to minimize the rounding effects. 
The following + * calculation could have been written in a clearer + * way like this: + * + * sleep_ms = hz * p->p_racct->r_resources[resource] / + * rule->rr_amount; + * sleep_ms *= rctl_throttle_pct / 100; + * if (sleep_ms < rctl_throttle_min) + * sleep_ms = rctl_throttle_min; + * + */ + sleep_ms = xmul(hz, p->p_racct->r_resources[resource]); + sleep_ms = xmul(sleep_ms, rctl_throttle_pct) / 100; + if (sleep_ms < rctl_throttle_min * rule->rr_amount) + sleep_ms = rctl_throttle_min * rule->rr_amount; + + /* + * Multiply that by the ratio of the resource + * consumption for the container compared to the limit, + * squared. In other words, a process in a container + * that is two times over the limit will be throttled + * four times as much for hitting the same rule. The + * point is to penalize processes more if the container + * itself (eg certain UID or jail) is above the limit. + */ + if (available < 0) + sleep_ratio = -available / rule->rr_amount; + else + sleep_ratio = 0; + sleep_ratio = xmul(sleep_ratio, sleep_ratio); + sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100; + sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio)); + + /* + * Finally the division. 
+ */ + sleep_ms /= rule->rr_amount; + + if (sleep_ms > rctl_throttle_max) + sleep_ms = rctl_throttle_max; +#if 0 + printf("%s: pid %d (%s), %jd of %jd, will sleep for %ld ms (ratio %ld, available %ld)\n", + __func__, p->p_pid, p->p_comm, + p->p_racct->r_resources[resource], + rule->rr_amount, sleep_ms, sleep_ratio, available); +#endif + + KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %u\n", + __func__, (uintmax_t)sleep_ms, rctl_throttle_min)); + racct_proc_throttle(p, sleep_ms); + continue; default: if (link->rrl_exceeded != 0) continue; @@ -642,6 +803,9 @@ if ((size_t)(end - str) != strlen(str)) return (EINVAL); + if (*value <= 0) + return (ERANGE); + return (0); } @@ -1008,8 +1172,13 @@ error = str2int64(amountstr, &rule->rr_amount); if (error != 0) goto out; - if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) + if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) { + if (rule->rr_amount > INT64_MAX / 1000000) { + error = ERANGE; + goto out; + } rule->rr_amount *= 1000000; + } } if (perstr == NULL || perstr[0] == '\0') @@ -1048,20 +1217,27 @@ KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); /* - * Some rules just don't make sense. Note that the one below - * cannot be rewritten using RACCT_IS_DENIABLE(); the RACCT_PCTCPU, - * for example, is not deniable in the racct sense, but the - * limit is enforced in a different way, so "deny" rules for %CPU - * do make sense. + * Some rules just don't make sense, like "deny" rule for an undeniable + * resource. The exception are the RSS and %CPU resources - they are + * not deniable in the racct sense, but the limit is enforced in + * a different way.
*/ if (rule->rr_action == RCTL_ACTION_DENY && - (rule->rr_resource == RACCT_CPU || - rule->rr_resource == RACCT_WALLCLOCK)) + !RACCT_IS_DENIABLE(rule->rr_resource) && + rule->rr_resource != RACCT_RSS && + rule->rr_resource != RACCT_PCTCPU) { return (EOPNOTSUPP); + } + + if (rule->rr_action == RCTL_ACTION_THROTTLE && + !RACCT_IS_DECAYING(rule->rr_resource)) { + return (EOPNOTSUPP); + } if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS && - RACCT_IS_SLOPPY(rule->rr_resource)) + RACCT_IS_SLOPPY(rule->rr_resource)) { return (EOPNOTSUPP); + } /* * Make sure there are no duplicated rules. Also, for the "deny" @@ -1346,7 +1522,9 @@ for (i = 0; i <= RACCT_MAX; i++) { if (sloppy == 0 && RACCT_IS_SLOPPY(i)) continue; + mtx_lock(&racct_lock); amount = racct->r_resources[i]; + mtx_unlock(&racct_lock); if (RACCT_IS_IN_MILLIONS(i)) amount /= 1000000; sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount); @@ -1619,6 +1797,81 @@ return (error); } +static void +rctl_rule_find_callback(struct racct *racct, void *arg2, void *arg3) +{ + struct rctl_rule *filter = (struct rctl_rule *)arg2; + struct rctl_rule_link *link; + int found = 0; + + ASSERT_RACCT_ENABLED(); + rw_assert(&rctl_lock, RA_LOCKED); + + LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { + if (!rctl_rule_matches(link->rrl_rule, filter)) + continue; + found++; + } + + *((int *)arg3) += found; +} + +/* + * Try to set racct_pctcpu_throttle_not_deny to the value given. + * It can only be set to '1' if there are no 'pcpu:deny' rules, + * and to '0' only if there are no 'pcpu:throttle" rules. + */ +static int +rctl_set_pctcpu_throttle_not_deny(int val) +{ + struct rctl_rule *filter; + struct proc *p; + int found; + + if (racct_pctcpu_throttle_not_deny == val) + return (0); + + /* + * The purpose of this lock here is not only to be able to iterate + * over processes, but also to avoid race with rctl_rule_add() + * adding a clashing rule after we do the search, but before we set + * the value. 
Besides, rctl_rule_add() holds this lock anyway. + */ + sx_assert(&allproc_lock, SA_LOCKED); + + filter = rctl_rule_alloc(M_WAITOK); + filter->rr_resource = RACCT_PCTCPU; + if (val == 1) + filter->rr_action = RCTL_ACTION_DENY; + else + filter->rr_action = RCTL_ACTION_THROTTLE; + + found = 0; + + rctl_rule_pre_callback(); + FOREACH_PROC_IN_SYSTEM(p) + rctl_rule_find_callback(p->p_racct, filter, &found); + rctl_rule_post_callback(); + + loginclass_racct_foreach(rctl_rule_find_callback, + rctl_rule_pre_callback, rctl_rule_post_callback, + filter, &found); + ui_racct_foreach(rctl_rule_find_callback, + rctl_rule_pre_callback, rctl_rule_post_callback, + filter, &found); + prison_racct_foreach(rctl_rule_find_callback, + rctl_rule_pre_callback, rctl_rule_post_callback, + filter, &found); + + rctl_rule_release(filter); + + if (found != 0) + return (EOPNOTSUPP); + + racct_pctcpu_throttle_not_deny = val; + return (0); +} + int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { @@ -1656,6 +1909,19 @@ goto out; } + if (rule->rr_resource == RACCT_PCTCPU) { + if (rule->rr_action == RCTL_ACTION_DENY) { + error = rctl_set_pctcpu_throttle_not_deny(0); + if (error != 0) + goto out; + } + if (rule->rr_action == RCTL_ACTION_THROTTLE) { + error = rctl_set_pctcpu_throttle_not_deny(1); + if (error != 0) + goto out; + } + } + error = rctl_rule_add(rule); out: @@ -1935,6 +2201,15 @@ UMA_ALIGN_PTR, UMA_ZONE_NOFREE); rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + + if (rctl_throttle_min <= 0) + rctl_throttle_min = 1; + if (rctl_throttle_max <= 0) + rctl_throttle_max = 2 * hz; + if (rctl_throttle_pct <= 0) + rctl_throttle_pct = 100; + if (rctl_throttle_pct2 <= 0) + rctl_throttle_pct2 = 100; } #else /* !RCTL */ Index: sys/kern/subr_trap.c =================================================================== --- sys/kern/subr_trap.c +++ sys/kern/subr_trap.c @@ -172,10 +172,14 @@ (td->td_vnet_lpush 
!= NULL) ? td->td_vnet_lpush : "N/A")); #endif #ifdef RACCT - if (racct_enable && p->p_throttled == 1) { + if (racct_enable && p->p_throttled != 0) { PROC_LOCK(p); - while (p->p_throttled == 1) - msleep(p->p_racct, &p->p_mtx, 0, "racct", 0); + while (p->p_throttled != 0) { + msleep(p->p_racct, &p->p_mtx, 0, "racct", + p->p_throttled < 0 ? 0 : p->p_throttled); + if (p->p_throttled > 0) + p->p_throttled = 0; + } PROC_UNLOCK(p); } #endif Index: sys/kern/vfs_bio.c =================================================================== --- sys/kern/vfs_bio.c +++ sys/kern/vfs_bio.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -1784,8 +1785,16 @@ rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { - if (!TD_IS_IDLETHREAD(curthread)) + if (!TD_IS_IDLETHREAD(curthread)) { +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, rabp, 0); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ curthread->td_ru.ru_inblock++; + } rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; rabp->b_ioflags &= ~BIO_ERROR; @@ -1829,8 +1838,16 @@ /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { - if (!TD_IS_IDLETHREAD(curthread)) + if (!TD_IS_IDLETHREAD(curthread)) { +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, bp, 0); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ curthread->td_ru.ru_inblock++; + } bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; @@ -1926,8 +1943,16 @@ bp->b_runningbufspace = bp->b_bufsize; space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); - if (!TD_IS_IDLETHREAD(curthread)) + if (!TD_IS_IDLETHREAD(curthread)) { +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, bp, 1); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ curthread->td_ru.ru_oublock++; + } if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); Index: 
sys/kern/vfs_cluster.c =================================================================== --- sys/kern/vfs_cluster.c +++ sys/kern/vfs_cluster.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -241,6 +242,13 @@ BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, bp, 0); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ curthread->td_ru.ru_inblock++; } @@ -294,6 +302,13 @@ BUF_KERNPROC(rbp); rbp->b_iooffset = dbtob(rbp->b_blkno); bstrategy(rbp); +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, rbp, 0); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ curthread->td_ru.ru_inblock++; } Index: sys/sys/proc.h =================================================================== --- sys/sys/proc.h +++ sys/sys/proc.h @@ -620,7 +620,7 @@ after fork. */ uint64_t p_prev_runtime; /* (c) Resource usage accounting. */ struct racct *p_racct; /* (b) Resource accounting. */ - u_char p_throttled; /* (c) Flag for racct pcpu throttling */ + int p_throttled; /* (c) Flag for racct pcpu throttling */ struct vm_domain_policy p_vm_dom_policy; /* (c) process default VM domain, or -1 */ /* * An orphan is the child that has beed re-parented to the Index: sys/sys/racct.h =================================================================== --- sys/sys/racct.h +++ sys/sys/racct.h @@ -42,6 +42,7 @@ #include #include +struct buf; struct proc; struct rctl_rule_link; struct ucred; @@ -71,7 +72,11 @@ #define RACCT_SHMSIZE 18 #define RACCT_WALLCLOCK 19 #define RACCT_PCTCPU 20 -#define RACCT_MAX RACCT_PCTCPU +#define RACCT_READBPS 21 +#define RACCT_WRITEBPS 22 +#define RACCT_READIOPS 23 +#define RACCT_WRITEIOPS 24 +#define RACCT_MAX RACCT_WRITEIOPS /* * Resource properties. 
@@ -85,6 +90,7 @@ extern int racct_types[]; extern int racct_enable; +extern int racct_pctcpu_throttle_not_deny; #define ASSERT_RACCT_ENABLED() KASSERT(racct_enable, \ ("%s called with !racct_enable", __func__)) @@ -98,7 +104,7 @@ /* * Resource usage can drop, as opposed to only grow. When the process - * terminates, its resource usage is freed from the respective + * terminates, its resource usage is subtracted from the respective * per-credential racct containers. */ #define RACCT_IS_RECLAIMABLE(X) (racct_types[X] & RACCT_RECLAIMABLE) @@ -126,8 +132,7 @@ * When a process terminates, its resource usage is not automatically * subtracted from per-credential racct containers. Instead, the resource * usage of per-credential racct containers decays in time. - * Resource usage can olso drop for such resource. - * So far, the only such resource is RACCT_PCTCPU. + * Resource usage can also drop for such resources. */ #define RACCT_IS_DECAYING(X) (racct_types[X] & RACCT_DECAYING) @@ -149,11 +154,14 @@ SYSCTL_DECL(_kern_racct); +extern struct mtx racct_lock; + #ifdef RACCT int racct_add(struct proc *p, int resource, uint64_t amount); void racct_add_cred(struct ucred *cred, int resource, uint64_t amount); void racct_add_force(struct proc *p, int resource, uint64_t amount); +void racct_add_buf(struct proc *p, const struct buf *bufp, int is_write); int racct_set(struct proc *p, int resource, uint64_t amount); void racct_set_force(struct proc *p, int resource, uint64_t amount); void racct_sub(struct proc *p, int resource, uint64_t amount); @@ -171,6 +179,7 @@ void racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, struct ucred *newcred); void racct_move(struct racct *dest, struct racct *src); +void racct_proc_throttle(struct proc *p, int timeout); #else Index: sys/sys/rctl.h =================================================================== --- sys/sys/rctl.h +++ sys/sys/rctl.h @@ -129,7 +129,8 @@ #define RCTL_ACTION_DENY (RCTL_ACTION_SIGNAL_MAX + 1) #define
RCTL_ACTION_LOG (RCTL_ACTION_SIGNAL_MAX + 2) #define RCTL_ACTION_DEVCTL (RCTL_ACTION_SIGNAL_MAX + 3) -#define RCTL_ACTION_MAX RCTL_ACTION_DEVCTL +#define RCTL_ACTION_THROTTLE (RCTL_ACTION_SIGNAL_MAX + 4) +#define RCTL_ACTION_MAX RCTL_ACTION_THROTTLE #define RCTL_AMOUNT_UNDEFINED -1 @@ -140,6 +141,7 @@ int rctl_rule_add(struct rctl_rule *rule); int rctl_rule_remove(struct rctl_rule *filter); int rctl_enforce(struct proc *p, int resource, uint64_t amount); +void rctl_throttle_decay(struct racct *racct, int resource); int64_t rctl_pcpu_available(const struct proc *p); uint64_t rctl_get_limit(struct proc *p, int resource); uint64_t rctl_get_available(struct proc *p, int resource); Index: sys/ufs/ffs/ffs_inode.c =================================================================== --- sys/ufs/ffs/ffs_inode.c +++ sys/ufs/ffs/ffs_inode.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -659,6 +660,13 @@ vp = ITOV(ip); bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, bp, 0); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ curthread->td_ru.ru_inblock++; /* pay for read */ bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; Index: sys/ufs/ffs/ffs_softdep.c =================================================================== --- sys/ufs/ffs/ffs_softdep.c +++ sys/ufs/ffs/ffs_softdep.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -6229,6 +6230,13 @@ vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, bp, 0); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ curthread->td_ru.ru_inblock++; error = bufwait(bp); if (error) { Index: sys/ufs/ufs/ufs_bmap.c =================================================================== --- sys/ufs/ufs/ufs_bmap.c +++ sys/ufs/ufs/ufs_bmap.c @@ -44,6 +44,7 @@ 
#include #include #include +#include #include #include @@ -223,6 +224,13 @@ vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, bp, 0); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ curthread->td_ru.ru_inblock++; error = bufwait(bp); if (error) { Index: sys/vm/vm_fault.c =================================================================== --- sys/vm/vm_fault.c +++ sys/vm/vm_fault.c @@ -83,6 +83,7 @@ #include #include #include +#include #include #include #include @@ -994,6 +995,21 @@ if (hardfault) { PCPU_INC(cnt.v_io_faults); curthread->td_ru.ru_majflt++; +#ifdef RACCT + if (racct_enable && fs.object->type == OBJT_VNODE) { + PROC_LOCK(curproc); + if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { + racct_add_force(curproc, RACCT_WRITEBPS, + PAGE_SIZE + behind * PAGE_SIZE); + racct_add_force(curproc, RACCT_WRITEIOPS, 1); + } else { + racct_add_force(curproc, RACCT_READBPS, + PAGE_SIZE + ahead * PAGE_SIZE); + racct_add_force(curproc, RACCT_READIOPS, 1); + } + PROC_UNLOCK(curproc); + } +#endif } else curthread->td_ru.ru_minflt++; Index: usr.bin/rctl/rctl.8 =================================================================== --- usr.bin/rctl/rctl.8 +++ usr.bin/rctl/rctl.8 @@ -25,7 +25,7 @@ .\" .\" $FreeBSD$ .\" -.Dd November 29, 2015 +.Dd January 30, 2016 .Dt RCTL 8 .Os .Sh NAME @@ -204,14 +204,22 @@ .It Sy shmsize Ta "SysV shared memory size, in bytes" .It Sy wallclock Ta "wallclock time, in seconds" .It Sy pcpu Ta "%CPU, in percents of a single CPU core" +.It Sy readbps Ta "filesystem reads, in bytes per second" +.It Sy writebps Ta "filesystem writes, in bytes per second" +.It Sy readiops Ta "filesystem reads, in operations per second" +.It Sy writeiops Ta "filesystem writes, in operations per second" .El .Sh ACTIONS .Bl -column -offset 3n "pseudoterminals" .It Em action .It Sy deny Ta deny the allocation; not supported for -.Sy cputime +.Sy cputime , +.Sy 
wallclock , +.Sy readbps , +.Sy writebps , +.Sy readiops , and -.Sy wallclock +.Sy writeiops .It Sy log Ta "log a warning to the console" .It Sy devctl Ta "send notification to" .Xr devd 8 @@ -228,6 +236,19 @@ See .Xr signal 3 for a list of supported signals +.It Sy throttle Ta "slow down process execution"; only supported for +.Sy readbps , +.Sy writebps , +.Sy readiops , +and +.Sy writeiops . +Note that one cannot have both +.Sy throttle +and +.Sy deny +rules for the +.Sy pcpu +resource at the same time. .El .Pp Not all actions are supported for all resources. @@ -287,3 +308,22 @@ Limiting .Sy memoryuse may kill the machine due to thrashing. +.Pp +The +.Sy readiops +and +.Sy writeiops +counters are only approximations. +Like +.Sy readbps +and +.Sy writebps , +they are calculated in the filesystem layer, where it is difficult +or even impossible to observe actual disk device operations. +.Pp +The +.Sy writebps +and +.Sy writeiops +resources generally account for writes to the filesystem cache, +not to actual devices.