diff --git a/head/sys/amd64/conf/GENERIC b/head/sys/amd64/conf/GENERIC
--- a/head/sys/amd64/conf/GENERIC
+++ b/head/sys/amd64/conf/GENERIC
@@ -88,6 +88,8 @@
 options         INCLUDE_CONFIG_FILE     # Include this file in kernel
 options         RACCT                   # Resource accounting framework
 options         RACCT_DEFAULT_TO_DISABLED # Set kern.racct.enable=0 by default
+options         RACCT_RT                # Realtime cputime calc for all objects
+options         RACCT_RT_PCTCPU         # Also realtime %cpu calc for all objects
 options         RCTL                    # Resource limits

 # Debugging support.  Always need this:
diff --git a/head/sys/conf/options b/head/sys/conf/options
--- a/head/sys/conf/options
+++ b/head/sys/conf/options
@@ -962,6 +962,8 @@
 # Resource Accounting
 RACCT           opt_global.h
 RACCT_DEFAULT_TO_DISABLED       opt_global.h
+RACCT_RT        opt_global.h
+RACCT_RT_PCTCPU opt_global.h

 # Resource Limits
 RCTL            opt_global.h
diff --git a/head/sys/kern/kern_clock.c b/head/sys/kern/kern_clock.c
--- a/head/sys/kern/kern_clock.c
+++ b/head/sys/kern/kern_clock.c
@@ -58,6 +58,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -707,6 +708,10 @@
        runtime = new_switchtime - PCPU_GET(switchtime);
        td->td_runtime += runtime;
        td->td_incruntime += runtime;
+#if defined(RACCT) && defined(RACCT_RT)
+       if (RACCT_ENABLED())
+               racct_rt_add_thread_runtime(td, runtime);
+#endif
        PCPU_SET(switchtime, new_switchtime);
        sched_clock(td, cnt);
diff --git a/head/sys/kern/kern_jail.c b/head/sys/kern/kern_jail.c
--- a/head/sys/kern/kern_jail.c
+++ b/head/sys/kern/kern_jail.c
@@ -2239,6 +2239,7 @@
        char *errmsg, *name;
        int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos;
        unsigned f;
+       uint64_t r_us, r_uspersec;

        if (flags & ~JAIL_GET_MASK)
                return (EINVAL);
@@ -2449,6 +2450,20 @@
        error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
        if (error != 0 && error != ENOENT)
                goto done;
+#if defined(RACCT) && defined(RACCT_RT)
+       if (RACCT_ENABLED())
+               racct_rt_get_runtime(pr->pr_prison_racct->prr_racct, &r_us,
+                   &r_uspersec, NULL);
+       else
+#endif
+               r_us = r_uspersec = 0;
+       error =
+           vfs_setopt(opts, "racct.rt.us", &r_us, sizeof(r_us));
+       if (error != 0 && error != ENOENT)
+               goto done;
+       error = vfs_setopt(opts, "racct.rt.uspersec", &r_uspersec,
+           sizeof(r_uspersec));
+       if (error != 0 && error != ENOENT)
+               goto done;

        /* Get the module parameters. */
        mtx_unlock(&pr->pr_mtx);
diff --git a/head/sys/kern/kern_proc.c b/head/sys/kern/kern_proc.c
--- a/head/sys/kern/kern_proc.c
+++ b/head/sys/kern/kern_proc.c
@@ -58,6 +58,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -274,6 +275,7 @@
        mtx_init(&p->p_statmtx, "pstatl", NULL, MTX_SPIN | MTX_NEW);
        mtx_init(&p->p_itimmtx, "pitiml", NULL, MTX_SPIN | MTX_NEW);
        mtx_init(&p->p_profmtx, "pprofl", NULL, MTX_SPIN | MTX_NEW);
+       mtx_init(&p->p_rtmtx, "p_rt_l", NULL, MTX_SPIN | MTX_NEW);
        cv_init(&p->p_pwait, "ppwait");
        TAILQ_INIT(&p->p_threads);      /* all threads in proc */
        EVENTHANDLER_DIRECT_INVOKE(process_init, p);
@@ -1220,6 +1222,9 @@
 fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp, int preferthread)
 {
        struct proc *p;
+#if defined(RACCT) && defined(RACCT_RT) && defined(RACCT_RT_PCTCPU)
+       uint64_t uspersec;
+#endif

        p = td->td_proc;
        kp->ki_tdaddr = td;
@@ -1307,6 +1312,14 @@
                rufetchtd(td, &kp->ki_rusage);
                kp->ki_runtime = cputick2usec(td->td_rux.rux_runtime);
                kp->ki_pctcpu = sched_pctcpu(td);
+#if defined(RACCT) && defined(RACCT_RT) && defined(RACCT_RT_PCTCPU)
+               if (RACCT_ENABLED()) {
+                       racct_rt_get_thread_runtime(td, &uspersec, NULL);
+                       /* XXX temporary hack for testing */
+                       kp->ki_sparelongs[1] = kp->ki_pctcpu;
+                       kp->ki_pctcpu = (uint64_t)FSCALE * uspersec / 1000000;
+               }
+#endif
                kp->ki_estcpu = sched_estcpu(td);
                kp->ki_cow = td->td_cow;
        }
@@ -1329,6 +1342,9 @@
 void
 fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp)
 {
+#if defined(RACCT) && defined(RACCT_RT) && defined(RACCT_RT_PCTCPU)
+       uint64_t uspersec;
+#endif

        MPASS(FIRST_THREAD_IN_PROC(p) != NULL);
        bzero(kp, sizeof(*kp));
@@ -1337,6 +1353,14 @@
        fill_kinfo_proc_only(p, kp);
        fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp,
            0);
        fill_kinfo_aggregate(p, kp);
+#if defined(RACCT) && defined(RACCT_RT) && defined(RACCT_RT_PCTCPU)
+       if (RACCT_ENABLED()) {
+               racct_rt_get_runtime(p->p_racct, NULL, &uspersec, NULL);
+               /* XXX temporary hack for testing */
+               kp->ki_sparelongs[1] = kp->ki_pctcpu;
+               kp->ki_pctcpu = (uint64_t)FSCALE * uspersec / 1000000;
+       }
+#endif
 }

 struct pstats *
diff --git a/head/sys/kern/kern_racct.c b/head/sys/kern/kern_racct.c
--- a/head/sys/kern/kern_racct.c
+++ b/head/sys/kern/kern_racct.c
@@ -463,6 +463,9 @@
        KASSERT(*racctp == NULL, ("racct already allocated"));

        *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO);
+#ifdef RACCT_RT
+       mtx_init(&(*racctp)->r_rtmtx, "racct rtlock", NULL, MTX_SPIN | MTX_NEW);
+#endif
 }

 static void
@@ -491,6 +494,9 @@
                    "%ju allocated for resource %d\n",
                    racct->r_resources[i], i));
        }
+#ifdef RACCT_RT
+       mtx_destroy(&racct->r_rtmtx);
+#endif
        uma_zfree(racct_zone, racct);
        *racctp = NULL;
 }
@@ -1363,4 +1369,167 @@
 }
 SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL);

+#ifdef RACCT_RT
+/*
+ * This macro is used only inside racct_rt_add_thread_runtime().
+ * It uses these local variables from racct_rt_add_thread_runtime():
+ *     int t;
+ *     u_int dt, gap;
+ */
+#ifndef RACCT_RT_PCTCPU
+#define RACCT_ADD_RUNTIME(var, racct, add)                             \
+do {                                                                   \
+       (var) = (racct);                                                \
+       RACCT_RTLOCK(var);                                              \
+       (var)->r_runtime += (add);                                      \
+       RACCT_RTUNLOCK(var);                                            \
+} while (0)
+#else
+#define RACCT_ADD_RUNTIME(var, racct, add)                             \
+do {                                                                   \
+       (var) = (racct);                                                \
+       RACCT_RTLOCK(var);                                              \
+       (var)->r_runtime += (add);                                      \
+       dt = t - (var)->r_rtlastticks;                                  \
+       if (dt + gap > gap) {                                           \
+               if (dt >= (u_int)hz)                                    \
+                       (var)->r_rtpersec = 0;                          \
+               else                                                    \
+                       (var)->r_rtpersec = (var)->r_rtpersec *         \
+                           (hz - (dt)) / hz + (var)->r_ltruntime;      \
+               (var)->r_ltruntime = 0;                                 \
+               (var)->r_rtlastticks = (t);                             \
+       }                                                               \
+       (var)->r_ltruntime += (add);                                    \
+       RACCT_RTUNLOCK(var);                                            \
+} while (0)
+#endif
+
+void
+racct_rt_add_thread_runtime(struct thread *td, uint64_t add)
+{
+       struct proc *p;
+       struct ucred *cred;
+       struct racct *racct;
+       struct prison *pr;
+#ifdef RACCT_RT_PCTCPU
+       int t = ticks;
+       u_int dt, gap;
+#endif
+
+       MPASS(td == curthread);
+       p = td->td_proc;
+#ifdef RACCT_RT_PCTCPU
+       PROC_RTLOCK(p);
+       dt = t - td->td_rtlastticks;
+       gap = 10 * hz;
+       if (dt + gap > gap) {
+               if (dt >= (u_int)hz)
+                       td->td_rtpersec = 0;
+               else
+                       td->td_rtpersec = td->td_rtpersec * (hz - dt) / hz +
+                           td->td_ltruntime;
+               td->td_ltruntime = 0;
+               td->td_rtlastticks = t;
+       }
+       td->td_ltruntime += add;
+       PROC_RTUNLOCK(p);
+#endif
+       RACCT_ADD_RUNTIME(racct, p->p_racct, add);
+       cred = td->td_ucred;
+       RACCT_ADD_RUNTIME(racct, cred->cr_ruidinfo->ui_racct, add);
+       for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
+               RACCT_ADD_RUNTIME(racct, pr->pr_prison_racct->prr_racct, add);
+       RACCT_ADD_RUNTIME(racct, cred->cr_loginclass->lc_racct, add);
+}
+
+void
+racct_rt_get_runtime(struct racct *racct, uint64_t *us_total,
+    uint64_t *us_persec, uint64_t *us_idle)
+{
+       uint64_t rt;
+#ifdef RACCT_RT_PCTCPU
+       uint64_t rtps;
+       uint64_t ltrt;
+       int dt;
+#endif
+
+       ASSERT_RACCT_ENABLED();
+       RACCT_RTLOCK(racct);
+       rt = racct->r_runtime;
+#ifdef RACCT_RT_PCTCPU
+       rtps = racct->r_rtpersec;
+       ltrt = racct->r_ltruntime;
+       dt = ticks - racct->r_rtlastticks;
+       if (dt >= hz || dt < -10 * hz) {
+               racct->r_rtpersec = 0;
+               racct->r_ltruntime = 0;
+       }
+#endif
+       RACCT_RTUNLOCK(racct);
+       if (us_total != NULL)
+               *us_total = cputick2usec(rt);
+       if (us_persec != NULL) {
+#ifdef RACCT_RT_PCTCPU
+               if (dt >= hz || dt < -10 * hz)
+                       rtps = 0;
+               else if (dt > 0)
+                       rtps = rtps * (hz - dt) / hz + ltrt;
+               *us_persec = cputick2usec(rtps);
+#else
+               *us_persec = 0;
+#endif
+       }
+       if (us_idle != NULL) {
+#ifdef RACCT_RT_PCTCPU
+               *us_idle = dt * 1000000ULL / hz;
+#else
+               *us_idle = 0;
+#endif
+       }
+}
+
+void
+racct_rt_get_thread_runtime(struct thread *td, uint64_t *us_persec,
+    uint64_t *us_idle)
+{
+       struct proc *p;
+#ifdef RACCT_RT_PCTCPU
+       uint64_t rtps;
+       uint64_t ltrt;
+       int dt;
+#endif
+
+       ASSERT_RACCT_ENABLED();
+#ifdef RACCT_RT_PCTCPU
+       p = td->td_proc;
+       PROC_RTLOCK(p);
+       rtps = td->td_rtpersec;
+       ltrt = td->td_ltruntime;
+       dt = ticks - td->td_rtlastticks;
+       if (dt >= hz || dt < -10 * hz) {
+               td->td_rtpersec = 0;
+               td->td_ltruntime = 0;
+       }
+       PROC_RTUNLOCK(p);
+#endif
+       if (us_persec != NULL) {
+#ifdef RACCT_RT_PCTCPU
+               if (dt >= hz || dt < -10 * hz)
+                       rtps = 0;
+               else if (dt > 0)
+                       rtps = rtps * (hz - dt) / hz + ltrt;
+               *us_persec = cputick2usec(rtps);
+#else
+               *us_persec = 0;
+#endif
+       }
+       if (us_idle != NULL) {
+#ifdef RACCT_RT_PCTCPU
+               *us_idle = dt * 1000000ULL / hz;
+#else
+               *us_idle = 0;
+#endif
+       }
+}
+#endif
+
 #endif /* !RACCT */
diff --git a/head/sys/kern/kern_resource.c b/head/sys/kern/kern_resource.c
--- a/head/sys/kern/kern_resource.c
+++ b/head/sys/kern/kern_resource.c
@@ -846,6 +846,10 @@
                runtime = u - PCPU_GET(switchtime);
                td->td_runtime += runtime;
                td->td_incruntime += runtime;
+#if defined(RACCT) && defined(RACCT_RT)
+               if (RACCT_ENABLED())
+                       racct_rt_add_thread_runtime(td, runtime);
+#endif
                PCPU_SET(switchtime, u);
        }
        /* Make sure the per-thread stats are current.
         */
@@ -878,6 +882,10 @@
                runtime = u - PCPU_GET(switchtime);
                td->td_runtime += runtime;
                td->td_incruntime += runtime;
+#if defined(RACCT) && defined(RACCT_RT)
+               if (RACCT_ENABLED())
+                       racct_rt_add_thread_runtime(td, runtime);
+#endif
                PCPU_SET(switchtime, u);
        }
        ruxagg_locked(p, td);
diff --git a/head/sys/kern/kern_synch.c b/head/sys/kern/kern_synch.c
--- a/head/sys/kern/kern_synch.c
+++ b/head/sys/kern/kern_synch.c
@@ -52,6 +52,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -530,6 +531,10 @@
        runtime = new_switchtime - PCPU_GET(switchtime);
        td->td_runtime += runtime;
        td->td_incruntime += runtime;
+#if defined(RACCT) && defined(RACCT_RT)
+       if (RACCT_ENABLED())
+               racct_rt_add_thread_runtime(td, runtime);
+#endif
        PCPU_SET(switchtime, new_switchtime);
        td->td_generation++;    /* bump preempt-detect counter */
        VM_CNT_INC(v_swtch);
diff --git a/head/sys/kern/kern_thread.c b/head/sys/kern/kern_thread.c
--- a/head/sys/kern/kern_thread.c
+++ b/head/sys/kern/kern_thread.c
@@ -41,6 +41,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -89,19 +90,19 @@
     "struct thread KBI td_flags");
 _Static_assert(offsetof(struct thread, td_pflags) == 0x110,
     "struct thread KBI td_pflags");
-_Static_assert(offsetof(struct thread, td_frame) == 0x4a8,
+_Static_assert(offsetof(struct thread, td_frame) == 0x4c0,
     "struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x6b0,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x6d0,
     "struct thread KBI td_emuldata");
 _Static_assert(offsetof(struct proc, p_flag) == 0xb8,
     "struct proc KBI p_flag");
 _Static_assert(offsetof(struct proc, p_pid) == 0xc4,
     "struct proc KBI p_pid");
-_Static_assert(offsetof(struct proc, p_filemon) == 0x3c8,
+_Static_assert(offsetof(struct proc, p_filemon) == 0x3e8,
     "struct proc KBI p_filemon");
-_Static_assert(offsetof(struct proc, p_comm) == 0x3e0,
+_Static_assert(offsetof(struct proc, p_comm) == 0x400,
     "struct proc KBI p_comm");
-_Static_assert(offsetof(struct proc, p_emuldata) == 0x4c8,
+_Static_assert(offsetof(struct proc, p_emuldata) == 0x4e8,
     "struct proc KBI p_emuldata");
 #endif
 #ifdef __i386__
@@ -109,19 +110,19 @@
     "struct thread KBI td_flags");
 _Static_assert(offsetof(struct thread, td_pflags) == 0xa4,
     "struct thread KBI td_pflags");
-_Static_assert(offsetof(struct thread, td_frame) == 0x308,
+_Static_assert(offsetof(struct thread, td_frame) == 0x320,
     "struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x34c,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x364,
     "struct thread KBI td_emuldata");
 _Static_assert(offsetof(struct proc, p_flag) == 0x6c,
     "struct proc KBI p_flag");
 _Static_assert(offsetof(struct proc, p_pid) == 0x78,
     "struct proc KBI p_pid");
-_Static_assert(offsetof(struct proc, p_filemon) == 0x270,
+_Static_assert(offsetof(struct proc, p_filemon) == 0x284,
     "struct proc KBI p_filemon");
-_Static_assert(offsetof(struct proc, p_comm) == 0x284,
+_Static_assert(offsetof(struct proc, p_comm) == 0x298,
     "struct proc KBI p_comm");
-_Static_assert(offsetof(struct proc, p_emuldata) == 0x310,
+_Static_assert(offsetof(struct proc, p_emuldata) == 0x324,
     "struct proc KBI p_emuldata");
 #endif
@@ -971,13 +972,20 @@
        PROC_UNLOCK(p);
        PROC_STATLOCK(p);
        thread_lock(td);
+#if !defined(RACCT) || !defined(RACCT_RT)
        PROC_SUNLOCK(p);
+#endif

        /*
         * Do the same timestamp bookkeeping that mi_switch() would do.
         */
        new_switchtime = cpu_ticks();
        runtime = new_switchtime - PCPU_GET(switchtime);
        td->td_runtime += runtime;
        td->td_incruntime += runtime;
+#if defined(RACCT) && defined(RACCT_RT)
+       if (RACCT_ENABLED())
+               racct_rt_add_thread_runtime(td, runtime);
+       PROC_SUNLOCK(p);        /* protect the code above from proc_reap() */
+#endif
        PCPU_SET(switchtime, new_switchtime);
        PCPU_SET(switchticks, ticks);
        VM_CNT_INC(v_swtch);
diff --git a/head/sys/sys/proc.h b/head/sys/sys/proc.h
--- a/head/sys/sys/proc.h
+++ b/head/sys/sys/proc.h
@@ -170,6 +170,7 @@
  * y - created at first aio, doesn't change until exit or exec at which
  *     point we are single-threaded and only curthread changes it
  * z - zombie threads lock
+ * R - process rt-racct lock
  *
  * If the locking key specifies two identifiers (for example, p_pptr) then
  * either lock is sufficient for read access, but both locks must be held
@@ -298,6 +299,10 @@
        u_int           td_sticks;      /* (t) Statclock hits in system mode. */
        u_int           td_iticks;      /* (t) Statclock hits in intr mode. */
        u_int           td_uticks;      /* (t) Statclock hits in user mode. */
+       uint64_t        td_ltruntime;   /* (R) Still unprocessed cpu ticks */
+       uint64_t        td_rtpersec;    /* (R) Avg cpu ticks per second */
+       int             td_rtlastticks; /* (R) Last recalc hardclock tick */
+       int             td_pad1;        /* XXX avoid realigning all below */
        int             td_intrval;     /* (t) Return value for sleepq. */
        sigset_t        td_oldsigmask;  /* (k) Saved mask from pre sigpause. */
        volatile u_int  td_generation;  /* (k) For detection of preemption */
@@ -653,6 +658,7 @@
        struct mtx      p_statmtx;      /* Lock for the stats */
        struct mtx      p_itimmtx;      /* Lock for the virt/prof timers */
        struct mtx      p_profmtx;      /* Lock for the profiling */
+       struct mtx      p_rtmtx;        /* Lock for the realtime ucred-racct access */
        struct ksiginfo *p_ksi; /* Locked by parent proc lock */
        sigqueue_t      p_sigqueue;     /* (c) Sigs not delivered to a td.
         */
 #define p_siglist      p_sigqueue.sq_signals
@@ -769,6 +775,10 @@
 #define PROC_PROFUNLOCK(p)     mtx_unlock_spin(&(p)->p_profmtx)
 #define PROC_PROFLOCK_ASSERT(p, type)  mtx_assert(&(p)->p_profmtx, (type))

+#define PROC_RTLOCK(p)         mtx_lock_spin(&(p)->p_rtmtx)
+#define PROC_RTUNLOCK(p)       mtx_unlock_spin(&(p)->p_rtmtx)
+#define PROC_RTLOCK_ASSERT(p, type)    mtx_assert(&(p)->p_rtmtx, (type))
+
 /* These flags are kept in p_flag. */
 #define P_ADVLOCK      0x00000001      /* Process may hold a POSIX advisory lock. */
diff --git a/head/sys/sys/racct.h b/head/sys/sys/racct.h
--- a/head/sys/sys/racct.h
+++ b/head/sys/sys/racct.h
@@ -149,8 +149,17 @@
  * This structure must be filled with zeroes initially.
  */
 struct racct {
-       int64_t r_resources[RACCT_MAX + 1];
+       int64_t         r_resources[RACCT_MAX + 1];
        LIST_HEAD(, rctl_rule_link)     r_rule_links;
+#ifdef RACCT_RT
+       struct mtx      r_rtmtx;        /* Spin lock for realtime fields */
+       uint64_t        r_runtime;      /* How many cpu ticks we've run */
+#ifdef RACCT_RT_PCTCPU
+       uint64_t        r_ltruntime;    /* Still unprocessed cpu ticks */
+       uint64_t        r_rtpersec;     /* Avg cpu ticks per second */
+       int             r_rtlastticks;  /* Last recalc hardclock tick */
+#endif
+#endif
 };

 SYSCTL_DECL(_kern_racct);
@@ -174,6 +183,12 @@
        PROC_UNLOCK(p);                                         \
 } while (0)

+#ifdef RACCT_RT
+#define RACCT_RTLOCK(r)                mtx_lock_spin(&(r)->r_rtmtx)
+#define RACCT_RTUNLOCK(r)      mtx_unlock_spin(&(r)->r_rtmtx)
+#define RACCT_RTLOCK_ASSERT(r, type)   mtx_assert(&(r)->r_rtmtx, (type))
+#endif
+
 int    racct_add(struct proc *p, int resource, uint64_t amount);
 void   racct_add_cred(struct ucred *cred, int resource, uint64_t amount);
 void   racct_add_force(struct proc *p, int resource, uint64_t amount);
@@ -198,6 +213,13 @@
 void   racct_move(struct racct *dest, struct racct *src);
 void   racct_proc_throttled(struct proc *p);
 void   racct_proc_throttle(struct proc *p, int timeout);
+#ifdef RACCT_RT
+void   racct_rt_add_thread_runtime(struct thread *td, uint64_t add);
+void   racct_rt_get_runtime(struct racct *racct, uint64_t *us_total,
+           uint64_t *us_persec,
+           uint64_t *us_idle);
+void   racct_rt_get_thread_runtime(struct thread *td, uint64_t *us_persec,
+           uint64_t *us_idle);
+#endif
 #else