diff --git a/usr.bin/systat/Makefile b/usr.bin/systat/Makefile --- a/usr.bin/systat/Makefile +++ b/usr.bin/systat/Makefile @@ -4,7 +4,7 @@ .include PROG= systat -SRCS= cmds.c cmdtab.c devs.c fetch.c iostat.c keyboard.c main.c sysput.c \ +SRCS= cmds.c cmdtab.c devs.c fetch.c iolat.c iostat.c keyboard.c main.c sysput.c \ netcmds.c netstat.c pigs.c proc.c swap.c icmp.c \ mode.c ip.c sctp.c tcp.c zarc.c \ vmstat.c convtbl.c ifcmds.c ifstat.c diff --git a/usr.bin/systat/cmdtab.c b/usr.bin/systat/cmdtab.c --- a/usr.bin/systat/cmdtab.c +++ b/usr.bin/systat/cmdtab.c @@ -83,6 +83,9 @@ { "zarc", showzarc, fetchzarc, labelzarc, initzarc, openzarc, closezarc, 0, resetzarc, CF_ZFSARC }, + { "iolat", showiolat, fetchiolat, labeliolat, + initiolat, openiolat, closeiolat, cmdiolat, + 0, CF_LOADAV }, { NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0, 0 } }; struct cmdtab *curcmd = &cmdtab[0]; diff --git a/usr.bin/systat/extern.h b/usr.bin/systat/extern.h --- a/usr.bin/systat/extern.h +++ b/usr.bin/systat/extern.h @@ -88,6 +88,7 @@ void closeswap(WINDOW *); void closetcp(WINDOW *); int cmdifstat(const char *, const char *); +int cmdiolat(const char *, const char *); int cmdiostat(const char *, const char *); int cmdkre(const char *, const char *); int cmdnetstat(const char *, const char *); @@ -188,3 +189,4 @@ SYSTAT_CMD( zarc ); SYSTAT_CMD( sctp ); +SYSTAT_CMD( iolat ); diff --git a/usr.bin/systat/iolat.c b/usr.bin/systat/iolat.c new file mode 100644 --- /dev/null +++ b/usr.bin/systat/iolat.c @@ -0,0 +1,520 @@ +/* + * Copyright (c) 2021 Netflix, Inc + * + * SPDX-License-Identifier: BSD-2-Clause + */ + + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "systat.h" +#include "extern.h" +#include "devs.h" + +#define CAM_BASE "kern.cam" +#define LATENCY ".latencies" +#define CAM_IOSCHED_BASE "kern.cam.iosched.bucket_base_us" + +#define DEV_NAMSIZE 32 +#define OP_NAMSIZE 16 +#define MAX_LATS 32 + +static double high_thresh = 500; +static double med_thresh = 300; +static bool docolor = true; + +static int ndevs; +static SLIST_HEAD(, iosched_stat) curlist; + +struct iosched_op_stat { + int nlats; + uint64_t lats[MAX_LATS]; + uint64_t prev_lats[MAX_LATS]; +}; + +enum { OP_READ = 0, OP_WRITE, OP_TRIM, NUM_OPS }; +static const char *ops[NUM_OPS] = { "read", "write", "trim" }; +#define OP_READ_MASK (1 << OP_READ) +#define OP_WRITE_MASK (1 << OP_WRITE) +#define OP_TRIM_MASK (1 << OP_TRIM) + +static uint32_t flags = OP_READ_MASK | OP_WRITE_MASK | OP_TRIM_MASK; + +struct iosched_stat { + SLIST_ENTRY(iosched_stat) link; + char dev_name[DEV_NAMSIZE]; + int unit; + struct iosched_op_stat op_stats[NUM_OPS]; +}; + +static int name2oid(const char *, int *); +static int walk_sysctl(int *, size_t); + +static int +name2oid(const char *name, int *oidp) +{ + int oid[2]; + int i; + size_t j; + + oid[0] = CTL_SYSCTL; + oid[1] = CTL_SYSCTL_NAME2OID; + + j = CTL_MAXNAME * sizeof(int); + i = sysctl(oid, 2, oidp, &j, name, strlen(name)); + if (i < 0) + return (i); + j /= sizeof(int); + return (j); +} + +static size_t /* Includes the trailing NUL */ +oid2name(int *oid, size_t nlen, char *name, size_t namlen) +{ + int qoid[CTL_MAXNAME + 2]; + int i; + size_t j; + + bzero(name, namlen); + qoid[0] = CTL_SYSCTL; + qoid[1] = CTL_SYSCTL_NAME; + memcpy(qoid + 2, oid, nlen * sizeof(int)); + j = namlen; + i = sysctl(qoid, nlen + 2, name, &j, 0, 0); + if (i || !j) + err(1, "sysctl name %d %zu %d", i, j, errno); + return (j); +} + +static int +oidfmt(int *oid, int len, u_int *kind) +{ + int qoid[CTL_MAXNAME+2]; + u_char buf[BUFSIZ]; + int i; + size_t j; + + qoid[0] = CTL_SYSCTL; + qoid[1] = CTL_SYSCTL_OIDFMT; + memcpy(qoid + 2, oid, len * sizeof(int)); + + j = sizeof(buf); + i = sysctl(qoid, len + 2, buf, &j, 0, 0); + if (i) + err(1, "sysctl fmt %d %zu %d", i, j, errno); + *kind = *(u_int *)buf; + return (0); +} + +static int +split_u64(char *str, const char *delim, uint64_t *buckets, int *nbuckets) +{ + int n = *nbuckets, i; + char *v; + + memset(buckets, 0, n * sizeof(buckets[0])); + for (i = 0; (v = strsep(&str, delim)) != NULL && i < n; i++) { + buckets[i] = strtoull(v, NULL, 10); + } + if (i < n) + *nbuckets = i; + return (i < n); +} + +static double baselat = 0.000020; + +static float +pest(int permill, uint64_t *lats, int nlat) +{ + uint64_t tot, samp; + int i; + float b1, b2; + + for (tot = 0, i = 0; i < nlat; i++) + tot += lats[i]; + if (tot == 0) + return -nanf(""); + if (tot < (uint64_t)2000 / (1000 - permill)) + return nanf(""); + samp = tot * permill / 1000; + if (samp < lats[0]) + return baselat * (float)samp / lats[0]; /* linear interpolation 0 and baselat */ + for (tot = 0, i = 0; samp >= tot && i < nlat; i++) + tot += lats[i]; + i--; + b1 = baselat * (1 << (i - 1)); + b2 = baselat * (1 << i); + /* Should expoentially interpolate between buckets -- doing linear instead */ + return b1 + (b2 - b1) * (float)(lats[i] - (tot - samp)) / lats[i]; +} + +static int +op2num(const char *op) +{ + for (int i = 0; i < NUM_OPS; i++) + if (strcmp(op, ops[i]) == 0) + return i; + return -1; +} + +static struct iosched_op_stat * +find_dev(const char *dev, int unit, int op) +{ + struct iosched_stat *isp; + struct iosched_op_stat *iosp; + + SLIST_FOREACH(isp, &curlist, link) { + if (strcmp(isp->dev_name, dev) != 0 || isp->unit != unit) + continue; + iosp = &isp->op_stats[op]; + return iosp; + } + return NULL; +} + +static struct iosched_op_stat * +alloc_dev(const char *dev, int unit, int op) +{ + struct iosched_stat *isp; + struct iosched_op_stat *iosp; + + isp = malloc(sizeof(*isp)); + if (isp == NULL) + return NULL; + strlcpy(isp->dev_name, dev, sizeof(isp->dev_name)); + isp->unit = unit; + SLIST_INSERT_HEAD(&curlist, isp, link); + ndevs++; + iosp = &isp->op_stats[op]; + return iosp; +} + +#define E3 1000.0 +static void +update_dev(const char *dev, int unit, int op, uint64_t *lats, int nlat) +{ + struct iosched_op_stat *iosp; + + iosp = find_dev(dev, unit, op); + if (iosp == NULL) + iosp = alloc_dev(dev, unit, op); + if (iosp == NULL) + return; + iosp->nlats = nlat; + memcpy(iosp->prev_lats, iosp->lats, iosp->nlats * sizeof(uint64_t)); + memcpy(iosp->lats, lats, iosp->nlats * sizeof(uint64_t)); +// printf("%s%d: %-6s %.3f %.3f %.3f %.3f\r\n", +// dev, unit, operation, E3 * pest(500, lats, nlat), E3 * pest(900, lats, nlat), +// E3 * pest(990, lats, nlat), E3 * pest(999, lats, nlat)); +} + +static int +walk_sysctl(int *base_oid, size_t len) +{ + int qoid[CTL_MAXNAME + 2], oid[CTL_MAXNAME]; + size_t l1, l2; + char name[BUFSIZ]; + + if (len > CTL_MAXNAME) + err(1, "Length %zd too long", len); + + qoid[0] = CTL_SYSCTL; + qoid[1] = CTL_SYSCTL_NEXT; + l1 = 2; + memcpy(qoid + 2, base_oid, len * sizeof(int)); + l1 += len; + for (;;) { + /* + * Get the next one or return when we get to the end of the + * sysctls in the kernel. + */ + l2 = sizeof(oid); + if (sysctl(qoid, l1, oid, &l2, 0, 0) != 0) { + if (errno == ENOENT) + return (0); + err(1, "sysctl(getnext) %zu", l2); + } + + l2 /= sizeof(int); + + /* + * Bail if we're seeing OIDs that don't have the + * same prefix or can't have the same prefix. + */ + if (l2 < len || + memcmp(oid, base_oid, len * sizeof(int)) != 0) + return (0); + + /* + * Get the name, validate it's one we're looking for, + * parse the latency and add to list. + */ + do { + int nlat; + size_t l3; + char val[BUFSIZ]; + char *walker, *dev, *opstr; + uint64_t latvals[MAX_LATS]; + u_int kind; + int unit, op; + + l1 = oid2name(oid, l2, name, sizeof(name)); + if (strcmp(name + l1 - strlen(LATENCY) - 1, LATENCY) != 0) + break; + if (oidfmt(oid, l2, &kind) != 0) + err(1, "oidfmt"); + if ((kind & CTLTYPE) != CTLTYPE_STRING) + errx(1, "string"); + l3 = sizeof(val); + if (sysctl(oid, l2, val, &l3, 0, 0) != 0) + err(1, "sysctl"); + val[l3] = '\0'; + nlat = nitems(latvals); + if (split_u64(val, ",", latvals, &nlat) == 0) + break; + walker = name + strlen(CAM_BASE) + 1; + dev = strsep(&walker, "."); + unit = (int)strtol(strsep(&walker, "."), NULL, 10); + strsep(&walker, "."); + opstr = strsep(&walker, "."); + op = op2num(opstr); + if (op < 0) + break; + update_dev(dev, unit, op, latvals, nlat); + } while (false); + + memcpy(qoid + 2, oid, l2 * sizeof(int)); + l1 = 2 + l2; + } +} + +void +closeiolat(WINDOW *w) +{ + if (w == NULL) + return; + wclear(w); + wrefresh(w); + delwin(w); +} + +static void +doublecmd(const char *cmd, double *v) +{ + const char *p; + double tv; + + p = strchr(cmd, '='); + if (p == NULL) + return; /* XXX Tell the user something? */ + if (sscanf(p + 1, "%lf", &tv) != 1) + return; /* XXX Tell the user something? */ + *v = tv; +} + +int +cmdiolat(const char *cmd __unused, const char *args __unused) +{ + fprintf(stderr, "CMD IS '%s'\n\n", cmd); + if (prefix(cmd, "trim")) + flags ^= OP_TRIM_MASK; + else if (prefix(cmd, "read")) + flags ^= OP_READ_MASK; + else if (prefix(cmd, "write")) + flags ^= OP_WRITE_MASK; + else if (prefix(cmd, "color")) + docolor = !docolor; + else if (prefix("high", cmd)) + doublecmd(cmd, &high_thresh); + else if (prefix("med", cmd)) + doublecmd(cmd, &med_thresh); + else + return (0); + wclear(wnd); + labeliolat(); + refresh(); + return (1); +} + +int +initiolat(void) +{ + int cam[CTL_MAXNAME]; + uint64_t sbt_base; + size_t len = sizeof(sbt_base); + + SLIST_INIT(&curlist); + + baselat = 1e-3; /* old default */ + if (sysctlbyname(CAM_IOSCHED_BASE, &sbt_base, &len, NULL, 0) == 0) + baselat = sbt_base * 1e-6; /* Convert to microseconds */ + + name2oid(CAM_BASE, cam); + walk_sysctl(cam, 2); + return (1); +} + +void +fetchiolat(void) +{ + int cam[CTL_MAXNAME]; + + name2oid(CAM_BASE, cam); + walk_sysctl(cam, 2); +} + +#define INSET 10 + +void +labeliolat(void) +{ + int _col, ndrives, lpr, row, j; + int regions __unused; + struct iosched_stat *isp; + char tmpstr[32]; +#define COLWIDTH 29 +#define DRIVESPERLINE ((getmaxx(wnd) - 1 - INSET) / COLWIDTH) + ndrives = ndevs; // XXX FILTER XXX + regions = howmany(ndrives, DRIVESPERLINE); + lpr = 2; /* for headers */ + for (int i = 0; i < NUM_OPS; i++) { + if (flags & (1 << i)) + lpr++; + } + row = 0; + _col = INSET; + j = 2; + if (flags & OP_READ_MASK) + mvwaddstr(wnd, row + j++, 1, "read"); + if (flags & OP_WRITE_MASK) + mvwaddstr(wnd, row + j++, 1, "write"); + if (flags & OP_TRIM_MASK) + mvwaddstr(wnd, row + j++, 1, "trim"); + SLIST_FOREACH(isp, &curlist, link) { + if (_col + COLWIDTH >= getmaxx(wnd) - 1 - INSET) { + _col = INSET; + row += lpr + 1; + if (row > getmaxy(wnd) - 1 - (lpr + 1)) + break; + j = 2; + if (flags & OP_READ_MASK) + mvwaddstr(wnd, row + j++, 1, "read"); + if (flags & OP_WRITE_MASK) + mvwaddstr(wnd, row + j++, 1, "write"); + if (flags & OP_TRIM_MASK) + mvwaddstr(wnd, row + j++, 1, "trim"); + } + snprintf(tmpstr, sizeof(tmpstr), "%s%d", isp->dev_name, isp->unit); + mvwaddstr(wnd, row, _col + (COLWIDTH - strlen(tmpstr)) / 2, tmpstr); + mvwaddstr(wnd, row + 1, _col, " p50 p90 p99 p99.9"); + _col += COLWIDTH; + } +} + +WINDOW * +openiolat(void) +{ + return (subwin(stdscr, LINES-3-1, 0, MAINWIN_ROW, 0)); +} + +static void +fmt(float f, char *buf, size_t len) +{ + if (isnan(f)) + strlcpy(buf, " - ", len); + else if (f >= 1000.0) + snprintf(buf, len, "%6d", (int)f); + else if (f >= 100.0) + snprintf(buf, len, "%6.1f", f); + else if (f >= 10.0) + snprintf(buf, len, "%6.2f", f); + else + snprintf(buf, len, "%6.3f", f); +} + +static void +latout(double lat, int y, int x) +{ + int i; + char tmpstr[32]; + + fmt(lat, tmpstr, sizeof(tmpstr)); + if (isnan(lat)) + i = 4; + else if (lat > high_thresh) + i = 3; + else if (lat > med_thresh) + i = 2; + else + i = 1; + if (docolor) + wattron(wnd, COLOR_PAIR(i)); + mvwaddstr(wnd, y, x, tmpstr); + if (docolor) + wattroff(wnd, COLOR_PAIR(i)); +} + +void +showiolat(void) +{ + int _col, ndrives, lpr, row, k; + int regions __unused; + struct iosched_stat *isp; + struct iosched_op_stat *iosp; +#define COLWIDTH 29 +#define DRIVESPERLINE ((getmaxx(wnd) - 1 - INSET) / COLWIDTH) + ndrives = ndevs; // XXX FILTER XXX + regions = howmany(ndrives, DRIVESPERLINE); + lpr = 2; /* XXX */ + for (int i = 0; i < NUM_OPS; i++) { + if (flags & (1 << i)) + lpr++; + } + row = 0; + _col = INSET; + SLIST_FOREACH(isp, &curlist, link) { + if (_col + COLWIDTH >= getmaxx(wnd) - 1 - INSET) { + _col = INSET; + row += lpr + 1; + if (row > getmaxy(wnd) - 1 - (lpr + 1)) + break; + } + k = 2; + for (int i = 0; i < NUM_OPS; i++) { + uint64_t lats[MAX_LATS]; + int nlats; + float p50, p90, p99, p999; + + if ((flags & (1 << i)) == 0) + continue; + iosp = &isp->op_stats[i]; + nlats = iosp->nlats; + memset(lats, 0, sizeof(lats)); + for (int j = 0; j < iosp->nlats; j++) + lats[j] = iosp->lats[j] - iosp->prev_lats[j]; + p50 = pest(500, lats, nlats) * E3; + p90 = pest(900, lats, nlats) * E3; + p99 = pest(990, lats, nlats) * E3; + p999 = pest(999, lats, nlats) * E3; + latout(p50, row + k, _col); + latout(p90, row + k, _col + 7); + latout(p99, row + k, _col + 14); + latout(p999, row + k, _col + 21); + k++; + } + _col += COLWIDTH; + } +} diff --git a/usr.bin/systat/main.c b/usr.bin/systat/main.c --- a/usr.bin/systat/main.c +++ b/usr.bin/systat/main.c @@ -155,6 +155,7 @@ char errbuf[_POSIX2_LINE_MAX], dummy; size_t size; struct cmdentry *cmd = NULL; + short cf, cb; (void) setlocale(LC_ALL, ""); @@ -214,6 +215,13 @@ * routines to minimize update work by curses. */ initscr(); + start_color(); + use_default_colors(); + pair_content(0, &cf, &cb); + init_pair(1, COLOR_GREEN, cb); + init_pair(2, COLOR_MAGENTA, cb); + init_pair(3, COLOR_RED, cb); + init_pair(4, COLOR_BLUE, cb); CMDLINE = LINES - 1; wnd = (*curcmd->c_open)(); if (wnd == NULL) { diff --git a/usr.bin/systat/systat.1 b/usr.bin/systat/systat.1 --- a/usr.bin/systat/systat.1 +++ b/usr.bin/systat/systat.1 @@ -88,6 +88,7 @@ .Ic icmp , .Ic icmp6 , .Ic ifstat , +.Ic iolat , .Ic iostat , .Ic ip , .Ic ip6 , @@ -244,6 +245,59 @@ Like .Ic icmp , but with TCP statistics. +.It Ic iolat +Display statistics describing the hardware latencies of I/O operations as +computed by the +.Va CAM_IOSCHED_DYNAMIC +option. +This option must be in the kernel config file of the running kernel for this +display to work. +All devices are displayed as there is currently no way to filter them. +The statistics displayed for the I/O latencies are the percentiles with +sufficient data during the polling interval to compute. +If a value cannot be estimated ``-'' is displayed. +The P50 (also known as the median), P90, P99 and P99.9 values are computed if +more than 2, 10, 100 or 1000 operations occurred during the polling interval. +The latency is the hardware latency values, and does not include any software +queueing time. +The latencies are estimated based on histogram data computed by the CAM I/O +scheduler and represent estimates of the actual value that are only good to +two or three significant digits. +The display of latency changes based on the scale of the latency to reflect +the precision of the estimates and to fit on the available screen space. +All latencies are reported in milliseconds. +When color is enabled +.Bl -bullet +.It Values below the medium latency threshold are displayed in green. +.It Values between the minimum latency and high latency thresholds are displayed +in magenta. +.It Values above the high latency thresholds are displayed in red. +.Pp +When color is disabled, the default foreground and background colors are always +used. +.Pp +The following commands are specific to the +.Ic iolat +display; the minimum unambiguous prefix may be supplied. +.Pp +.Bl -tag -width Fl -compact +.It Cm color +Toggle the use of color in the display. +The default is on. +.It Cm hi=XXX +Set the high latency threshold to XXX milliseconds. +.It Cm med=XXX +Set the medium latency threshold to XXX milliseconds. +.It Cm read +Toggle the display of statistics about read operations. +The default is on. +.It Cm write +Toggle the display of statistics about write operations. +The default is on. +.It Cm trim +Toggle the display of statistics about trim operations. +The default is on. +.El .It Ic iostat Display, in the lower window, statistics about processor use and disk throughput. @@ -682,3 +736,9 @@ .Ic vmstat display looks out of place because it is (it was added in as a separate display rather than created as a new program). +The +.Ic iolat +command doesn't implement the common device commands including +filtering, as it doesn't use the +.Xr devstat 3 +mechanism to obtain its statistics.