diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c
--- a/sys/kern/kern_racct.c
+++ b/sys/kern/kern_racct.c
@@ -113,6 +113,10 @@
     "struct proc *", "const struct buf *", "int");
 SDT_PROBE_DEFINE3(racct, , rusage, add__cred,
     "struct ucred *", "int", "uint64_t");
+SDT_PROBE_DEFINE3(racct, , rusage, add__cred__checked,
+    "struct ucred *", "int", "uint64_t");
+SDT_PROBE_DEFINE3(racct, , rusage, add__cred__checked__failure,
+    "struct ucred *", "int", "uint64_t");
 SDT_PROBE_DEFINE3(racct, , rusage, add__force,
     "struct proc *", "int", "uint64_t");
 SDT_PROBE_DEFINE3(racct, , rusage, set,
@@ -628,7 +632,6 @@
 void
 racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
 {
-
 	if (!racct_enable)
 		return;
 
@@ -639,6 +642,56 @@
 	RACCT_UNLOCK();
 }
 
+/*
+ * Worker for racct_add_cred_checked(); it is called with the racct lock
+ * held.  With RCTL compiled in, the rules are evaluated first and the
+ * charge is refused only if they object and 'resource' is deniable.
+ */
+static int
+racct_add_cred_checked_locked(struct ucred *cred, int resource, uint64_t amount)
+{
+#ifdef RCTL
+	int denied;
+#endif
+
+	ASSERT_RACCT_ENABLED();
+
+#ifdef RCTL
+	denied = rctl_enforce_cred(cred, resource, amount);
+	if (denied != 0 && RACCT_IS_DENIABLE(resource)) {
+		SDT_PROBE3(racct, , rusage, add__cred__checked__failure, cred,
+		    resource, amount);
+		return (denied);
+	}
+#endif
+
+	racct_add_cred_locked(cred, resource, amount);
+	return (0);
+}
+
+/*
+ * Charge 'amount' of 'resource' to credential 'cred', unless resource limits
+ * forbid it.  Returns 0 if the charge was recorded, or the non-zero value
+ * handed back by rctl_enforce_cred() if it was refused.  Always returns 0
+ * when racct is disabled.
+ */
+int
+racct_add_cred_checked(struct ucred *cred, int resource, uint64_t amount)
+{
+	int error;
+
+	if (!racct_enable)
+		return (0);
+
+	SDT_PROBE3(racct, , rusage, add__cred__checked, cred, resource, amount);
+
+	RACCT_LOCK();
+	error = racct_add_cred_checked_locked(cred, resource, amount);
+	RACCT_UNLOCK();
+
+	return (error);
+}
+
 /*
  * Account for disk IO resource consumption.  Checks for limits,
  * but never fails, due to disk limits being undeniable.
diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c
--- a/sys/kern/kern_rctl.c
+++ b/sys/kern/kern_rctl.c
@@ -221,6 +221,8 @@
 
 static int rctl_rule_fully_specified(const struct rctl_rule *rule);
 static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);
+static int rctl_enforce_racct(struct racct *racct, int resource,
+    uint64_t amount, struct ucred *cred);
 
 static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
 
@@ -332,16 +334,13 @@
 }
 
 static struct racct *
-rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
+rctl_proc_rule_to_racct_cred(const struct ucred *cred,
+    const struct rctl_rule *rule)
 {
-	struct ucred *cred = p->p_ucred;
-
-	ASSERT_RACCT_ENABLED();
-	RACCT_LOCK_ASSERT();
+	KASSERT(rule->rr_per != RCTL_SUBJECT_TYPE_PROCESS,
+	    ("rctl_proc_rule_to_racct_cred: cannot get process racct"));
 
 	switch (rule->rr_per) {
-	case RCTL_SUBJECT_TYPE_PROCESS:
-		return (p->p_racct);
 	case RCTL_SUBJECT_TYPE_USER:
 		return (cred->cr_ruidinfo->ui_racct);
 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
@@ -353,6 +352,22 @@
 	}
 }
 
+static struct racct *
+rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
+{
+	struct ucred *cred = p->p_ucred;
+
+	ASSERT_RACCT_ENABLED();
+	RACCT_LOCK_ASSERT();
+
+	switch (rule->rr_per) {
+	case RCTL_SUBJECT_TYPE_PROCESS:
+		return (p->p_racct);
+	default:
+		return (rctl_proc_rule_to_racct_cred(cred, rule));
+	}
+}
+
 /*
  * Return the amount of resource that can be allocated by 'p' before
  * hitting 'rule'.
@@ -372,6 +387,26 @@
 	return (available);
 }
 
+/*
+ * Return the amount of resource that can be allocated by 'cred' before
+ * hitting 'rule'.
+ */
+static int64_t
+rctl_available_resource_cred(const struct ucred *cred,
+    const struct rctl_rule *rule)
+{
+	const struct racct *racct;
+	int64_t available;
+
+	ASSERT_RACCT_ENABLED();
+	RACCT_LOCK_ASSERT();
+
+	racct = rctl_proc_rule_to_racct_cred(cred, rule);
+	available = rule->rr_amount - racct->r_resources[rule->rr_resource];
+
+	return (available);
+}
+
 /*
  * Called every second for proc, uidinfo, loginclass, and jail containers.
  * If the limit isn't exceeded, it decreases the usage amount to zero.
@@ -489,6 +524,300 @@
 	return (a * b);
 }
 
+/*
+ * Check whether the credential 'cred' can allocate 'amount' of 'resource' in
+ * addition to what it keeps allocated now.  Returns non-zero if the allocation
+ * should be denied, 0 otherwise.  Does not enforce rules whose actions require
+ * a process, i.e., throttle and sig*.
+ *
+ * All three containers are always consulted, even once one of them has asked
+ * for denial, so that log and devctl rules linked to the others still run.
+ * rctl_enforce_racct() returns either 0 or EDOOFUS, so OR-ing its results
+ * cannot yield a mixed error code.
+ *
+ * NOTE(review): only the credential's own jail container is consulted here;
+ * confirm whether rules attached to enclosing jails need to be reached too.
+ */
+int
+rctl_enforce_cred(struct ucred *cred, int resource, uint64_t amount)
+{
+	int error;
+
+	error = rctl_enforce_racct(cred->cr_ruidinfo->ui_racct,
+	    resource, amount, cred);
+	error |= rctl_enforce_racct(cred->cr_loginclass->lc_racct,
+	    resource, amount, cred);
+	error |= rctl_enforce_racct(cred->cr_prison->pr_prison_racct->prr_racct,
+	    resource, amount, cred);
+
+	return (error);
+}
+
+/*
+ * Print a rate-limited console message for a matched "log" rule.  The subject
+ * is described by 'p' when it is non-NULL and by 'cred' otherwise.
+ *
+ * Returns non-zero if the message was printed, 0 if it was dropped by the
+ * rate limiter or because the buffer could not be allocated.  Callers mark
+ * the rule link as exceeded only on a non-zero return, so a dropped message
+ * is attempted again on the next enforcement.
+ */
+static int
+rctl_log_handler(struct rctl_rule *rule, struct proc *p, struct ucred *cred)
+{
+	static struct timeval log_lasttime;
+	static int log_curtime = 0;
+	struct sbuf sb;
+	char *buf;
+
+	if (!ppsratecheck(&log_lasttime, &log_curtime, rctl_log_rate_limit))
+		return (0);
+
+	if (p != NULL)
+		cred = p->p_ucred;
+
+	buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
+	if (buf == NULL) {
+		printf("%s: out of memory\n", __func__);
+		return (0);
+	}
+	sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
+	rctl_rule_to_sbuf(&sb, rule);
+	sbuf_finish(&sb);
+	if (p != NULL) {
+		printf("rctl: rule \"%s\" matched by pid %d "
+		    "(%s), uid %d, jail %s\n", sbuf_data(&sb),
+		    p->p_pid, p->p_comm, cred->cr_uid,
+		    cred->cr_prison->pr_prison_racct->prr_name);
+	} else {
+		printf("rctl: rule \"%s\" matched by uid %d, jail %s\n",
+		    sbuf_data(&sb), cred->cr_uid,
+		    cred->cr_prison->pr_prison_racct->prr_name);
+	}
+	sbuf_delete(&sb);
+	free(buf, M_RCTL);
+
+	return (1);
+}
+
+/*
+ * Send a rate-limited devctl notification for a matched "devctl" rule.  The
+ * subject is described by 'p' when it is non-NULL and by 'cred' otherwise.
+ *
+ * Returns non-zero if the notification was sent, 0 if it was dropped by the
+ * rate limiter or because the buffer could not be allocated.  Callers mark
+ * the rule link as exceeded only on a non-zero return, so a dropped
+ * notification is attempted again on the next enforcement.
+ */
+static int
+rctl_devctl_handler(struct rctl_rule *rule, struct proc *p, struct ucred *cred)
+{
+	static struct timeval devctl_lasttime;
+	static int devctl_curtime = 0;
+	struct sbuf sb;
+	char *buf;
+
+	if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
+	    rctl_devctl_rate_limit))
+		return (0);
+
+	if (p != NULL)
+		cred = p->p_ucred;
+
+	buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
+	if (buf == NULL) {
+		printf("%s: out of memory\n", __func__);
+		return (0);
+	}
+	sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
+	sbuf_printf(&sb, "rule=");
+	rctl_rule_to_sbuf(&sb, rule);
+	if (p != NULL)
+		sbuf_printf(&sb, " pid=%d", p->p_pid);
+	sbuf_printf(&sb, " ruid=%d jail=%s", cred->cr_ruid,
+	    cred->cr_prison->pr_prison_racct->prr_name);
+	sbuf_finish(&sb);
+	devctl_notify("RCTL", "rule", "matched", sbuf_data(&sb));
+	sbuf_delete(&sb);
+	free(buf, M_RCTL);
+
+	return (1);
+}
+
+/*
+ * Put 'p' to sleep for having hit the throttle rule 'rule' on 'resource'.
+ * The delay grows with the process' own usage and with how far its
+ * container is over the limit, and is capped at rctl_throttle_max.
+ */
+static void
+rctl_throttle_handler(struct rctl_rule *rule, int resource, struct proc *p)
+{
+	uint64_t sleep_ms, sleep_ratio;
+	int64_t available;
+
+	if (rule->rr_amount == 0) {
+		racct_proc_throttle(p, rctl_throttle_max);
+		return;
+	}
+
+	/*
+	 * Make the process sleep for a fraction of second
+	 * proportional to the ratio of process' resource
+	 * utilization compared to the limit.  The point is
+	 * to penalize resource hogs: processes that consume
+	 * more of the available resources sleep for longer.
+	 *
+	 * We're trying to defer division until the very end,
+	 * to minimize the rounding effects.  The following
+	 * calculation could have been written in a clearer
+	 * way like this:
+	 *
+	 *	sleep_ms = hz * p->p_racct->r_resources[resource] /
+	 *	    rule->rr_amount;
+	 *	sleep_ms *= rctl_throttle_pct / 100;
+	 *	if (sleep_ms < rctl_throttle_min)
+	 *		sleep_ms = rctl_throttle_min;
+	 *
+	 */
+	sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
+	sleep_ms = xmul(sleep_ms, rctl_throttle_pct) / 100;
+	if (sleep_ms < rctl_throttle_min * rule->rr_amount)
+		sleep_ms = rctl_throttle_min * rule->rr_amount;
+
+	/*
+	 * Multiply that by the ratio of the resource
+	 * consumption for the container compared to the limit,
+	 * squared.  In other words, a process in a container
+	 * that is two times over the limit will be throttled
+	 * four times as much for hitting the same rule.  The
+	 * point is to penalize processes more if the container
+	 * itself (eg certain UID or jail) is above the limit.
+	 */
+	available = rctl_available_resource(p, rule);
+	if (available < 0)
+		sleep_ratio = -available / rule->rr_amount;
+	else
+		sleep_ratio = 0;
+	sleep_ratio = xmul(sleep_ratio, sleep_ratio);
+	sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
+	sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
+
+	/*
+	 * Finally the division.
+	 */
+	sleep_ms /= rule->rr_amount;
+
+	if (sleep_ms > rctl_throttle_max)
+		sleep_ms = rctl_throttle_max;
+#if 0
+	printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n",
+	    __func__, p->p_pid, p->p_comm,
+	    p->p_racct->r_resources[resource],
+	    rule->rr_amount, (uintmax_t)sleep_ms,
+	    (uintmax_t)sleep_ratio, (intmax_t)available);
+#endif
+
+	KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
+	    __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
+	racct_proc_throttle(p, sleep_ms);
+}
+
+/*
+ * Deliver to 'p' the signal requested by the sig* action of 'rule'.
+ */
+static void
+rctl_sig_handler(struct rctl_rule *rule, struct proc *p)
+{
+	KASSERT(rule->rr_action > 0 &&
+	    rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
+	    ("rctl_sig: unknown action %d",
+	     rule->rr_action));
+
+	/*
+	 * We're using the fact that RCTL_ACTION_SIG* values
+	 * are equal to their counterparts from sys/signal.h.
+	 */
+	kern_psignal(p, rule->rr_action);
+}
+
+/*
+ * Evaluate, on behalf of 'cred', the rules linked to 'racct' that apply to
+ * 'resource'; per-process rules are skipped.  Log and devctl actions are
+ * carried out here, while throttle and sig* actions are ignored.
+ *
+ * Returns EDOOFUS if an exceeded rule asks for denial, 0 otherwise.
+ */
+static int
+rctl_enforce_racct(struct racct *racct, int resource, uint64_t amount,
+    struct ucred *cred)
+{
+	struct rctl_rule *rule;
+	struct rctl_rule_link *link;
+	int64_t available;
+	int should_deny = 0;
+
+	ASSERT_RACCT_ENABLED();
+	RACCT_LOCK_ASSERT();
+
+	/*
+	 * There may be more than one matching rule; go through all of them.
+	 * Denial should be done last, after logging and notifying.
+	 */
+	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
+		rule = link->rrl_rule;
+		if (rule->rr_resource != resource)
+			continue;
+		if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS)
+			continue;
+
+		available = rctl_available_resource_cred(cred, rule);
+		if (available >= (int64_t)amount) {
+			link->rrl_exceeded = 0;
+			continue;
+		}
+
+		switch (rule->rr_action) {
+		case RCTL_ACTION_DENY:
+			should_deny = 1;
+			continue;
+		case RCTL_ACTION_LOG:
+			/*
+			 * If rrl_exceeded != 0, it means we've already
+			 * logged a warning for this rule link.
+			 */
+			if (link->rrl_exceeded != 0)
+				continue;
+
+			if (!rctl_log_handler(rule, NULL, cred))
+				continue;
+			link->rrl_exceeded = 1;
+			continue;
+		case RCTL_ACTION_DEVCTL:
+			if (link->rrl_exceeded != 0)
+				continue;
+
+			if (!rctl_devctl_handler(rule, NULL, cred))
+				continue;
+			link->rrl_exceeded = 1;
+			continue;
+		case RCTL_ACTION_THROTTLE:
+		default:
+			/* Throttle and sig* actions need a process. */
+			continue;
+		}
+	}
+
+	if (should_deny) {
+		/*
+		 * Return fake error code; the caller should change it
+		 * into one proper for the situation - EFSIZ, ENOMEM etc.
+		 */
+		return (EDOOFUS);
+	}
+
+	return (0);
+}
+
 /*
  * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
  * to what it keeps allocated now.  Returns non-zero if the allocation should
@@ -497,14 +826,9 @@
 int
 rctl_enforce(struct proc *p, int resource, uint64_t amount)
 {
-	static struct timeval log_lasttime, devctl_lasttime;
-	static int log_curtime = 0, devctl_curtime = 0;
 	struct rctl_rule *rule;
 	struct rctl_rule_link *link;
-	struct sbuf sb;
-	char *buf;
 	int64_t available;
-	uint64_t sleep_ms, sleep_ratio;
 	int should_deny = 0;
 
 	ASSERT_RACCT_ENABLED();
@@ -547,24 +871,8 @@
 			if (p->p_state != PRS_NORMAL)
 				continue;
 
-			if (!ppsratecheck(&log_lasttime, &log_curtime,
-			    rctl_log_rate_limit))
-				continue;
-
-			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
-			if (buf == NULL) {
-				printf("rctl_enforce: out of memory\n");
-				continue;
-			}
-			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
-			rctl_rule_to_sbuf(&sb, rule);
-			sbuf_finish(&sb);
-			printf("rctl: rule \"%s\" matched by pid %d "
-			    "(%s), uid %d, jail %s\n", sbuf_data(&sb),
-			    p->p_pid, p->p_comm, p->p_ucred->cr_uid,
-			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
-			sbuf_delete(&sb);
-			free(buf, M_RCTL);
+			if (!rctl_log_handler(rule, p, NULL))
+				continue;
 			link->rrl_exceeded = 1;
 			continue;
 		case RCTL_ACTION_DEVCTL:
@@ -574,96 +882,15 @@
 			if (p->p_state != PRS_NORMAL)
 				continue;
 
-			if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
-			    rctl_devctl_rate_limit))
-				continue;
-
-			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
-			if (buf == NULL) {
-				printf("rctl_enforce: out of memory\n");
-				continue;
-			}
-			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
-			sbuf_printf(&sb, "rule=");
-			rctl_rule_to_sbuf(&sb, rule);
-			sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
-			    p->p_pid, p->p_ucred->cr_ruid,
-			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
-			sbuf_finish(&sb);
-			devctl_notify("RCTL", "rule", "matched",
-			    sbuf_data(&sb));
-			sbuf_delete(&sb);
-			free(buf, M_RCTL);
+			if (!rctl_devctl_handler(rule, p, NULL))
+				continue;
 			link->rrl_exceeded = 1;
 			continue;
 		case RCTL_ACTION_THROTTLE:
 			if (p->p_state != PRS_NORMAL)
 				continue;
 
-			if (rule->rr_amount == 0) {
-				racct_proc_throttle(p, rctl_throttle_max);
-				continue;
-			}
-
-			/*
-			 * Make the process sleep for a fraction of second
-			 * proportional to the ratio of process' resource
-			 * utilization compared to the limit.  The point is
-			 * to penalize resource hogs: processes that consume
-			 * more of the available resources sleep for longer.
-			 *
-			 * We're trying to defer division until the very end,
-			 * to minimize the rounding effects.  The following
-			 * calculation could have been written in a clearer
-			 * way like this:
-			 *
-			 *	sleep_ms = hz * p->p_racct->r_resources[resource] /
-			 *	    rule->rr_amount;
-			 *	sleep_ms *= rctl_throttle_pct / 100;
-			 *	if (sleep_ms < rctl_throttle_min)
-			 *		sleep_ms = rctl_throttle_min;
-			 *
-			 */
-			sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
-			sleep_ms = xmul(sleep_ms, rctl_throttle_pct) / 100;
-			if (sleep_ms < rctl_throttle_min * rule->rr_amount)
-				sleep_ms = rctl_throttle_min * rule->rr_amount;
-
-			/*
-			 * Multiply that by the ratio of the resource
-			 * consumption for the container compared to the limit,
-			 * squared.  In other words, a process in a container
-			 * that is two times over the limit will be throttled
-			 * four times as much for hitting the same rule.  The
-			 * point is to penalize processes more if the container
-			 * itself (eg certain UID or jail) is above the limit.
-			 */
-			if (available < 0)
-				sleep_ratio = -available / rule->rr_amount;
-			else
-				sleep_ratio = 0;
-			sleep_ratio = xmul(sleep_ratio, sleep_ratio);
-			sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
-			sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
-
-			/*
-			 * Finally the division.
-			 */
-			sleep_ms /= rule->rr_amount;
-
-			if (sleep_ms > rctl_throttle_max)
-				sleep_ms = rctl_throttle_max;
-#if 0
-			printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n",
-			    __func__, p->p_pid, p->p_comm,
-			    p->p_racct->r_resources[resource],
-			    rule->rr_amount, (uintmax_t)sleep_ms,
-			    (uintmax_t)sleep_ratio, (intmax_t)available);
-#endif
-
-			KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
-			    __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
-			racct_proc_throttle(p, sleep_ms);
+			rctl_throttle_handler(rule, resource, p);
 			continue;
 		default:
 			if (link->rrl_exceeded != 0)
@@ -672,16 +899,7 @@
 			if (p->p_state != PRS_NORMAL)
 				continue;
 
-			KASSERT(rule->rr_action > 0 &&
-			    rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
-			    ("rctl_enforce: unknown action %d",
-			     rule->rr_action));
-
-			/*
-			 * We're using the fact that RCTL_ACTION_SIG* values
-			 * are equal to their counterparts from sys/signal.h.
-			 */
-			kern_psignal(p, rule->rr_action);
+			rctl_sig_handler(rule, p);
 			link->rrl_exceeded = 1;
 			continue;
 		}
diff --git a/sys/sys/racct.h b/sys/sys/racct.h
--- a/sys/sys/racct.h
+++ b/sys/sys/racct.h
@@ -176,6 +176,8 @@
 
 int	racct_add(struct proc *p, int resource, uint64_t amount);
 void	racct_add_cred(struct ucred *cred, int resource, uint64_t amount);
+int	racct_add_cred_checked(struct ucred *cred, int resource,
+	    uint64_t amount);
 void	racct_add_force(struct proc *p, int resource, uint64_t amount);
 void	racct_add_buf(struct proc *p, const struct buf *bufp, int is_write);
 int	racct_set(struct proc *p, int resource, uint64_t amount);
diff --git a/sys/sys/rctl.h b/sys/sys/rctl.h
--- a/sys/sys/rctl.h
+++ b/sys/sys/rctl.h
@@ -142,10 +142,12 @@
 int	rctl_rule_add(struct rctl_rule *rule);
 int	rctl_rule_remove(struct rctl_rule *filter);
 int	rctl_enforce(struct proc *p, int resource, uint64_t amount);
+int	rctl_enforce_cred(struct ucred *cred, int resource, uint64_t amount);
 void	rctl_throttle_decay(struct racct *racct, int resource);
 int64_t	rctl_pcpu_available(const struct proc *p);
 uint64_t rctl_get_limit(struct proc *p, int resource);
 uint64_t rctl_get_available(struct proc *p, int resource);
+uint64_t rctl_get_available_cred(struct ucred *cred, int resource);
 const char *rctl_resource_name(int resource);
 void	rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred);
 int	rctl_proc_fork(struct proc *parent, struct proc *child);