Index: sys/kern/kern_thread.c =================================================================== --- sys/kern/kern_thread.c +++ sys/kern/kern_thread.c @@ -81,19 +81,19 @@ "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0x104, "struct thread KBI td_pflags"); -_Static_assert(offsetof(struct thread, td_frame) == 0x470, +_Static_assert(offsetof(struct thread, td_frame) == 0x478, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x518, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x520, "struct thread KBI td_emuldata"); -_Static_assert(offsetof(struct proc, p_flag) == 0xb0, +_Static_assert(offsetof(struct proc, p_flag) == 0xb8, "struct proc KBI p_flag"); -_Static_assert(offsetof(struct proc, p_pid) == 0xbc, +_Static_assert(offsetof(struct proc, p_pid) == 0xc4, "struct proc KBI p_pid"); -_Static_assert(offsetof(struct proc, p_filemon) == 0x3d0, +_Static_assert(offsetof(struct proc, p_filemon) == 0x3d8, "struct proc KBI p_filemon"); -_Static_assert(offsetof(struct proc, p_comm) == 0x3e0, +_Static_assert(offsetof(struct proc, p_comm) == 0x3e8, "struct proc KBI p_comm"); -_Static_assert(offsetof(struct proc, p_emuldata) == 0x4b8, +_Static_assert(offsetof(struct proc, p_emuldata) == 0x4c8, "struct proc KBI p_emuldata"); #endif #ifdef __i386__ @@ -101,19 +101,19 @@ "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0xa0, "struct thread KBI td_pflags"); -_Static_assert(offsetof(struct thread, td_frame) == 0x2e8, +_Static_assert(offsetof(struct thread, td_frame) == 0x2f0, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x334, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x33c, "struct thread KBI td_emuldata"); -_Static_assert(offsetof(struct proc, p_flag) == 0x68, +_Static_assert(offsetof(struct proc, p_flag) == 0x70, "struct proc KBI p_flag"); -_Static_assert(offsetof(struct proc, p_pid) == 0x74, +_Static_assert(offsetof(struct proc, p_pid) == 0x7c, "struct proc KBI p_pid"); -_Static_assert(offsetof(struct proc, p_filemon) == 0x27c, +_Static_assert(offsetof(struct proc, p_filemon) == 0x284, "struct proc KBI p_filemon"); -_Static_assert(offsetof(struct proc, p_comm) == 0x288, +_Static_assert(offsetof(struct proc, p_comm) == 0x290, "struct proc KBI p_comm"); -_Static_assert(offsetof(struct proc, p_emuldata) == 0x314, +_Static_assert(offsetof(struct proc, p_emuldata) == 0x31c, "struct proc KBI p_emuldata"); #endif Index: sys/kern/kern_timeout.c =================================================================== --- sys/kern/kern_timeout.c +++ sys/kern/kern_timeout.c @@ -163,15 +163,19 @@ struct cc_exec cc_exec_entity[2]; struct callout *cc_next; struct callout *cc_callout; - struct callout_list *cc_callwheel; + struct callout_list *cc_callwheel_high; + struct callout_list *cc_callwheel_low; struct callout_tailq cc_expireq; struct callout_slist cc_callfree; sbintime_t cc_firstevent; + sbintime_t cc_firstopt; sbintime_t cc_lastscan; void *cc_cookie; - u_int cc_bucket; - u_int cc_inited; +#ifdef KTR char cc_ktr_event_name[20]; +#endif + bool cc_inited; + bool cc_running; }; #define callout_migrating(c) ((c)->c_iflags & CALLOUT_DFRMIGRATION) @@ -319,17 +323,24 @@ mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE); SLIST_INIT(&cc->cc_callfree); - cc->cc_inited = 1; - cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize, - M_CALLOUT, M_WAITOK); - for (i = 0; i < callwheelsize; i++) - LIST_INIT(&cc->cc_callwheel[i]); 
+	cc->cc_inited = true;
+	cc->cc_running = false;
+	cc->cc_callwheel_high = malloc(sizeof(struct callout_list) *
+	    callwheelsize, M_CALLOUT, M_WAITOK);
+	cc->cc_callwheel_low = malloc(sizeof(struct callout_list) *
+	    callwheelsize, M_CALLOUT, M_WAITOK);
+	for (i = 0; i < callwheelsize; i++) {
+		LIST_INIT(&cc->cc_callwheel_high[i]);
+		LIST_INIT(&cc->cc_callwheel_low[i]);
+	}
 	TAILQ_INIT(&cc->cc_expireq);
 	cc->cc_firstevent = SBT_MAX;
 	for (i = 0; i < 2; i++)
 		cc_cce_cleanup(cc, i);
+#ifdef KTR
 	snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name),
 	    "callwheel cpu %d", cpu);
+#endif
 	if (cc->cc_callout == NULL)	/* Only BSP handles timeout(9) */
 		return;
 	for (i = 0; i < ncallout; i++) {
@@ -427,6 +438,9 @@
 	return (sbt >> (32 - CC_HASH_SHIFT));
 }
 
+#define	CC_BUCKET_WIDTH	(1LLU << (32 - CC_HASH_SHIFT))
+#define	CC_BUCKET_MASK	(CC_BUCKET_WIDTH - 1)
+
 static inline u_int
 callout_get_bucket(sbintime_t sbt)
 {
@@ -434,25 +448,73 @@
 	return (callout_hash(sbt) & callwheelmask);
 }
 
+static inline sbintime_t
+callout_trunc_time(sbintime_t sbt)
+{
+
+	return (sbt & (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT)));
+}
+
+#ifdef CALLOUT_PROFILING
+static inline struct callout *
+callout_process_single(struct callout_cpu *cc, struct callout *c, int bucket,
+    int *depth_dir, int *mpcalls_dir, int *lockcalls_dir)
+#else
+static inline struct callout *
+callout_process_single(struct callout_cpu *cc, struct callout *c, int bucket)
+#endif
+{
+	struct callout *rv;
+
+	CC_LOCK_ASSERT(cc);
+	/*
+	 * Consumer told us the callout may be run
+	 * directly from hardware interrupt context.
+	 */
+	if (c->c_iflags & CALLOUT_DIRECT) {
+#ifdef CALLOUT_PROFILING
+		++(*depth_dir);
+#endif
+		cc_exec_next(cc) = LIST_NEXT(c, c_links.le);
+		LIST_REMOVE(c, c_links.le);
+		softclock_call_cc(c, cc,
+#ifdef CALLOUT_PROFILING
+		    mpcalls_dir, lockcalls_dir, NULL,
+#endif
+		    1);
+		rv = cc_exec_next(cc);
+		cc_exec_next(cc) = NULL;
+	} else {
+		rv = LIST_NEXT(c, c_links.le);
+		LIST_REMOVE(c, c_links.le);
+		TAILQ_INSERT_TAIL(&cc->cc_expireq, c, c_links.tqe);
+		c->c_iflags |= CALLOUT_PROCESSED;
+	}
+
+	return (rv);
+}
+
 void
 callout_process(sbintime_t now)
 {
-	struct callout *tmp, *tmpn;
+	struct callout *tmp;
 	struct callout_cpu *cc;
 	struct callout_list *sc;
-	sbintime_t first, last, max, tmp_max;
+	sbintime_t first, max;
 	uint32_t lookahead;
-	u_int firstb, lastb, nowb;
+	u_int firstb, nowb;
 #ifdef CALLOUT_PROFILING
 	int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0;
 #endif
+	bool done;
 
 	cc = CC_SELF();
 	mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
 
 	/* Compute the buckets of the last scan and present times. */
 	firstb = callout_hash(cc->cc_lastscan);
-	cc->cc_lastscan = now;
+	first = callout_trunc_time(cc->cc_lastscan);
+	cc->cc_lastscan = now + 1;
 	nowb = callout_hash(now);
 
 	/* Compute the last bucket and minimum time of the bucket after it. */
@@ -462,90 +524,147 @@
 		lookahead = (SBT_1S / 8);
 	else
 		lookahead = (SBT_1S / 2);
-	first = last = now;
-	first += (lookahead / 2);
-	last += lookahead;
-	last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT));
-	lastb = callout_hash(last) - 1;
-	max = last;
+	cc->cc_firstopt = now + (lookahead / 2);
+	cc->cc_firstevent = callout_trunc_time(now + lookahead);
+	max = cc->cc_firstevent - 1;
+	cc->cc_running = true;
 
 	/*
-	 * Check if we wrapped around the entire wheel from the last scan.
-	 * In case, we need to scan entirely the wheel for pending callouts.
+	 * Deal with a case where we have been asleep so long we
+	 * need to scan all the buckets.
+ * + * If the space between first and now wraps, we clip first to the + * minimum value that avoids a wrap, and clip max to the maximum + * value from the same bucket as now. We then schedule an event to + * fire no later than the start of the next bucket. + * + * However, if there is no wrap between first and now, but only + * between first and max, we clip max at the maximum value that + * avoids a wrap. */ - if (lastb - firstb >= callwheelsize) { - lastb = firstb + callwheelsize - 1; - if (nowb - firstb >= callwheelsize) - nowb = lastb; + if (cc->cc_firstevent - first > callwheelsize * CC_BUCKET_WIDTH) { + if (first + ((callwheelsize - 1) * CC_BUCKET_WIDTH) < + callout_trunc_time(now)) { + /* The space between first and now wraps. */ + first = callout_trunc_time(now) - + ((callwheelsize - 1) * CC_BUCKET_WIDTH); + max = callout_trunc_time(now) + CC_BUCKET_MASK; + cc->cc_firstopt = cc->cc_firstevent = max + 1; + } else { + /* The space between first and last wraps. */ + cc->cc_firstevent = + first + (callwheelsize * CC_BUCKET_WIDTH); + max = cc->cc_firstevent - 1; + if (cc->cc_firstevent < cc->cc_firstopt) + cc->cc_firstopt = cc->cc_firstevent; + } } + KASSERT(first <= now && now <= max && first < max, + ("%s: time mismatch: first=%jd, now=%jd, max=%jd", + __func__, (intmax_t)first, (intmax_t)now, (intmax_t)max)); - /* Iterate callwheel from firstb to nowb and then up to lastb. */ + /* Iterate callwheel from first to now and then up to max. */ + done = false; do { - sc = &cc->cc_callwheel[firstb & callwheelmask]; + firstb = callout_get_bucket(first); + KASSERT(first == callout_trunc_time(first), + ("%s: Unexpected first time (%jx)", __func__, + (intmax_t)first)); + + /* Run high-precision callouts. */ + sc = &cc->cc_callwheel_high[firstb]; tmp = LIST_FIRST(sc); while (tmp != NULL) { /* Run the callout if present time within allowed. */ if (tmp->c_time <= now) { - /* - * Consumer told us the callout may be run - * directly from hardware interrupt context. - */ - if (tmp->c_iflags & CALLOUT_DIRECT) { + tmp = callout_process_single(cc, tmp, + firstb #ifdef CALLOUT_PROFILING - ++depth_dir; + , &depth_dir, &mpcalls_dir, &lockcalls_dir #endif - cc_exec_next(cc) = - LIST_NEXT(tmp, c_links.le); - cc->cc_bucket = firstb & callwheelmask; - LIST_REMOVE(tmp, c_links.le); - softclock_call_cc(tmp, cc, -#ifdef CALLOUT_PROFILING - &mpcalls_dir, &lockcalls_dir, NULL, -#endif - 1); - tmp = cc_exec_next(cc); - cc_exec_next(cc) = NULL; - } else { - tmpn = LIST_NEXT(tmp, c_links.le); - LIST_REMOVE(tmp, c_links.le); - TAILQ_INSERT_TAIL(&cc->cc_expireq, - tmp, c_links.tqe); - tmp->c_iflags |= CALLOUT_PROCESSED; - tmp = tmpn; - } + ); continue; } /* Skip events from distant future. */ - if (tmp->c_time >= max) - goto next; + if (tmp->c_time > max) + goto next_callout; /* * Event minimal time is bigger than present maximal * time, so it cannot be aggregated. */ - if (tmp->c_time > last) { - lastb = nowb; - goto next; + if (tmp->c_time > cc->cc_firstevent) { + done = true; + goto next_callout; } /* Update first and last time, respecting this event. */ - if (tmp->c_time < first) - first = tmp->c_time; - tmp_max = tmp->c_time + tmp->c_precision; - if (tmp_max < last) - last = tmp_max; -next: + if (tmp->c_time < cc->cc_firstopt) + cc->cc_firstopt = tmp->c_time; + if (tmp->c_deadline < cc->cc_firstevent) + cc->cc_firstevent = tmp->c_deadline; +next_callout: tmp = LIST_NEXT(tmp, c_links.le); } + /* + * We only walk the low-precision callout list when it is + * actually time to execute it. 
If it isn't time to execute + * the bucket yet, we'll simply check to see if it has items + * that need to execute. + */ + sc = &cc->cc_callwheel_low[firstb]; + if (first > now) { + /* + * Rather than walking the items in the bucket, + * we'll just check that there is something in + * the bucket. If so, we'll schedule the + * callout to fire using the bucket's range. + * + * If we find a low-precision bucket, we can + * stop further processing. (Things in later + * buckets will necessarily have later times + * than these.) + */ + if (LIST_FIRST(sc) != NULL) { + if (first < cc->cc_firstopt) + cc->cc_firstopt = first; + if (first + CC_BUCKET_MASK < cc->cc_firstevent) + cc->cc_firstevent = + first + CC_BUCKET_MASK; + done = true; + } + goto next_bucket; + } + /* Run low-precision callouts. */ + tmp = LIST_FIRST(sc); + while (tmp != NULL) { + /* + * Run the callout if present time within allowed. + * Note that callouts scheduled very far in the + * future may not be within range yet. Hence, our + * need to check the time, even for low-precision + * timers. + */ + if (tmp->c_time <= now) { + tmp = callout_process_single(cc, tmp, + firstb +#ifdef CALLOUT_PROFILING + , &depth_dir, &mpcalls_dir, &lockcalls_dir +#endif + ); + } else + tmp = LIST_NEXT(tmp, c_links.le); + } +next_bucket: /* Proceed with the next bucket. */ - firstb++; + first += CC_BUCKET_WIDTH; /* * Stop if we looked after present time and found * some event we can't execute at now. * Stop if we looked far enough into the future. */ - } while (((int)(firstb - lastb)) <= 0); - cc->cc_firstevent = last; + } while (first <= now || (first < max && !done)); + cc->cc_running = false; #ifndef NO_EVENTTIMERS - cpu_new_callout(curcpu, last, first); + cpu_new_callout(curcpu, cc->cc_firstevent, cc->cc_firstopt); #endif #ifdef CALLOUT_PROFILING avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8; @@ -593,8 +712,6 @@ int bucket; CC_LOCK_ASSERT(cc); - if (sbt < cc->cc_lastscan) - sbt = cc->cc_lastscan; c->c_arg = arg; c->c_iflags |= CALLOUT_PENDING; c->c_iflags &= ~CALLOUT_PROCESSED; @@ -602,26 +719,75 @@ if (flags & C_DIRECT_EXEC) c->c_iflags |= CALLOUT_DIRECT; c->c_func = func; - c->c_time = sbt; + + /* + * Set the time and deadline, but ensure they are no sooner + * than the first callout bucket that callout_process() will + * hit the next time it runs. + */ + if (sbt >= cc->cc_lastscan) + c->c_time = sbt; + else + c->c_time = cc->cc_lastscan; + if (SBT_MAX - sbt < precision) + c->c_deadline = SBT_MAX; + else { + c->c_deadline = sbt + precision; + if (c->c_deadline < cc->cc_lastscan) + c->c_deadline = cc->cc_lastscan; + } c->c_precision = precision; - bucket = callout_get_bucket(c->c_time); + CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x", c, (int)(c->c_precision >> 32), (u_int)(c->c_precision & 0xffffffff)); - LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le); - if (cc->cc_bucket == bucket) - cc_exec_next(cc) = c; + /* + * If the precision is high enough that the time through the + * deadline (time + precision) completely straddle a low-precisions + * bucket, then use a low-precision bucket. Otherwise, leave it in + * the high-precision buckets. + * + * There are three cases: + * 1. The time is now bucket, and the deadline is after the + * current bucket. In this case, the callout can live + * in the low-precision bucket. + * 2. The time is in a future bucket, but the entire space of a + * low-precision bucket is fully enclosed in the runnable range + * of the callout. 
In this case, the callout can live in that + * low-precision bucket. + * 3. Any other case, the callout must live in a high-precision + * bucket. + */ + bucket = callout_get_bucket(c->c_time); + sbt = roundup2(c->c_time, CC_BUCKET_WIDTH); + if (c->c_time == cc->cc_lastscan && c->c_deadline >= + callout_trunc_time(cc->cc_lastscan) + CC_BUCKET_MASK) { + c->c_deadline = callout_trunc_time(cc->cc_lastscan) + + CC_BUCKET_MASK; + LIST_INSERT_HEAD(&cc->cc_callwheel_low[bucket], c, c_links.le); + } else if (c->c_deadline >= sbt + CC_BUCKET_MASK) { + c->c_time = sbt; + c->c_deadline = sbt + CC_BUCKET_MASK; + bucket = callout_get_bucket(c->c_time); + LIST_INSERT_HEAD(&cc->cc_callwheel_low[bucket], c, c_links.le); + } else + LIST_INSERT_HEAD(&cc->cc_callwheel_high[bucket], c, c_links.le); #ifndef NO_EVENTTIMERS /* - * Inform the eventtimers(4) subsystem there's a new callout + * If callout_process() is currently running, we only need to + * feed our timings into its calculations. It will schedule the + * timer when done. + * + * Otherwise, inform the eventtimers(4) subsystem there's a new callout * that has been inserted, but only if really required. */ - if (SBT_MAX - c->c_time < c->c_precision) - c->c_precision = SBT_MAX - c->c_time; - sbt = c->c_time + c->c_precision; - if (sbt < cc->cc_firstevent) { - cc->cc_firstevent = sbt; - cpu_new_callout(cpu, sbt, c->c_time); + if (c->c_time < cc->cc_firstopt) + cc->cc_firstopt = c->c_time; + if (c->c_deadline < cc->cc_firstevent) { + cc->cc_firstevent = c->c_deadline; + if (!cc->cc_running) + cpu_new_callout(cpu, cc->cc_firstevent, + cc->cc_firstopt); } #endif } @@ -1027,8 +1193,7 @@ cancelled = 0; if (cpu == -1) { ignore_cpu = 1; - } else if ((cpu >= MAXCPU) || - ((CC_CPU(cpu))->cc_inited == 0)) { + } else if (cpu >= MAXCPU || !(CC_CPU(cpu))->cc_inited) { /* Invalid CPU spec */ panic("Invalid CPU in callout %d", cpu); } @@ -1535,18 +1700,44 @@ return (flsl(sbt)); } +static void +kern_callout_wheel_stats(struct callout_list *sc, int *c, sbintime_t *st, + sbintime_t *spr, sbintime_t *maxt, sbintime_t *maxpr, sbintime_t *now, + int *ct, int *cpr) +{ + struct callout *tmp; + sbintime_t t; + int cnt; + + cnt = 0; + LIST_FOREACH(tmp, sc, c_links.le) { + cnt++; + t = tmp->c_time - *now; + if (t < 0) + t = 0; + *st += t / SBT_1US; + *spr += tmp->c_precision / SBT_1US; + if (t > *maxt) + *maxt = t; + if (tmp->c_precision > *maxpr) + *maxpr = tmp->c_precision; + ct[flssbt(t)]++; + cpr[flssbt(tmp->c_precision)]++; + } + *c += cnt; +} + /* * Dump immediate statistic snapshot of the scheduled callouts. 
*/ static int sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS) { - struct callout *tmp; struct callout_cpu *cc; - struct callout_list *sc; sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t; int ct[64], cpr[64], ccpbk[32]; int error, val, i, count, tcum, pcum, maxc, c, medc; + int chi, clo, ctmp; #ifdef SMP int cpu; #endif @@ -1567,24 +1758,17 @@ #else cc = CC_CPU(timeout_cpu); #endif + chi = clo = 0; CC_LOCK(cc); for (i = 0; i < callwheelsize; i++) { - sc = &cc->cc_callwheel[i]; c = 0; - LIST_FOREACH(tmp, sc, c_links.le) { - c++; - t = tmp->c_time - now; - if (t < 0) - t = 0; - st += t / SBT_1US; - spr += tmp->c_precision / SBT_1US; - if (t > maxt) - maxt = t; - if (tmp->c_precision > maxpr) - maxpr = tmp->c_precision; - ct[flssbt(t)]++; - cpr[flssbt(tmp->c_precision)]++; - } + kern_callout_wheel_stats(&cc->cc_callwheel_high[i], + &c, &st, &spr, &maxt, &maxpr, &now, ct, cpr); + chi += c; + ctmp = c; + kern_callout_wheel_stats(&cc->cc_callwheel_low[i], + &c, &st, &spr, &maxt, &maxpr, &now, ct, cpr); + clo += c - ctmp; if (c > maxc) maxc = c; ccpbk[fls(c + c / 2)]++; @@ -1592,6 +1776,7 @@ } CC_UNLOCK(cc); #ifdef SMP + printf("CPU %d callouts: %d/%d/%d\n", cpu, clo, chi, clo + chi); } #endif @@ -1650,7 +1835,7 @@ #define C_DB_PRINTF(f, e) db_printf(" %s = " f "\n", #e, c->e); db_printf(" &c_links = %p\n", &(c->c_links)); C_DB_PRINTF("%" PRId64, c_time); - C_DB_PRINTF("%" PRId64, c_precision); + C_DB_PRINTF("%" PRId64, c_deadline); C_DB_PRINTF("%p", c_arg); C_DB_PRINTF("%p", c_func); C_DB_PRINTF("%p", c_lock); Index: sys/sys/_callout.h =================================================================== --- sys/sys/_callout.h +++ sys/sys/_callout.h @@ -54,7 +54,8 @@ SLIST_ENTRY(callout) sle; TAILQ_ENTRY(callout) tqe; } c_links; - sbintime_t c_time; /* ticks to the event */ + sbintime_t c_time; /* ticks to the event target */ + sbintime_t c_deadline; /* absolute deadline */ sbintime_t c_precision; /* delta allowed wrt opt */ void *c_arg; /* function argument */ void (*c_func)(void *); /* function to call */
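
The three-case placement rule added to callout_cc_add() is easier to follow with a small standalone illustration. The sketch below is not part of the diff; it is a hypothetical userland restatement of that rule, with use_low_precision_wheel(), trunc_time() and the lastscan parameter standing in for the kernel's callout_trunc_time() and cc->cc_lastscan, and CC_HASH_SHIFT assumed to match the value in kern_timeout.c. A callout is coarsened into the low-precision wheel only when its runnable window [c_time, c_deadline] is guaranteed to cover an entire bucket.

/*
 * Illustrative sketch (assumed userland harness, not part of the diff):
 * mirrors how callout_cc_add() chooses between the high- and
 * low-precision wheels.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef int64_t sbintime_t;		/* as in sys/time.h */

#define	CC_HASH_SHIFT	8		/* assumed to match kern_timeout.c */
#define	CC_BUCKET_WIDTH	(1LL << (32 - CC_HASH_SHIFT))
#define	CC_BUCKET_MASK	(CC_BUCKET_WIDTH - 1)

/* Round down to the start of the bucket containing sbt. */
static sbintime_t
trunc_time(sbintime_t sbt)
{
	return (sbt & ~(sbintime_t)CC_BUCKET_MASK);
}

/*
 * Case 1: the callout is due in the bucket currently being scanned
 *	   (time == lastscan) and its deadline reaches the end of that bucket.
 * Case 2: a whole future bucket fits inside [time, deadline].
 * Case 3: otherwise the window is too narrow; keep full precision.
 */
static bool
use_low_precision_wheel(sbintime_t time, sbintime_t deadline,
    sbintime_t lastscan)
{
	sbintime_t next_bucket;

	if (time == lastscan &&
	    deadline >= trunc_time(lastscan) + CC_BUCKET_MASK)
		return (true);			/* case 1 */
	/* Equivalent of roundup2(time, CC_BUCKET_WIDTH). */
	next_bucket = (time + CC_BUCKET_MASK) & ~(sbintime_t)CC_BUCKET_MASK;
	if (deadline >= next_bucket + CC_BUCKET_MASK)
		return (true);			/* case 2 */
	return (false);				/* case 3 */
}

int
main(void)
{
	sbintime_t now = (sbintime_t)1 << 40;	/* bucket-aligned */

	/* Target ~1ms out with ~8ms of slack: covers a whole bucket. */
	printf("%d\n", use_low_precision_wheel(now + (1LL << 22),
	    now + (1LL << 22) + (1LL << 25), now));
	/* Target ~1ms out with ~60us of slack: stays high precision. */
	printf("%d\n", use_low_precision_wheel(now + (1LL << 22),
	    now + (1LL << 22) + (1LL << 18), now));
	return (0);
}

Coarsened callouts share a bucket-wide deadline, which is why callout_process() can skip walking a low-precision bucket until it is actually due and program the eventtimer for the end of that bucket rather than for each individual callout.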