Index: etc/defaults/rc.conf
===================================================================
--- etc/defaults/rc.conf
+++ etc/defaults/rc.conf
@@ -682,6 +682,9 @@
 
 iovctl_files=""		# Config files for iovctl(8)
 
+irqrebalance_enable="NO"	# Balance IRQs dynamically
+irqrebalance_period=""		# Rebalance period (seconds) (defaults to 60)
+
 ##############################################################
 ### Jail Configuration (see rc.conf(5) manual page) ##########
 ##############################################################
Index: etc/rc.d/irqrebalance
===================================================================
--- /dev/null
+++ etc/rc.d/irqrebalance
@@ -0,0 +1,31 @@
+#!/bin/sh
+#
+# $FreeBSD$
+#
+
+# PROVIDE: irqrebalance
+
+. /etc/rc.subr
+
+name="irqrebalance"
+desc="Dynamically rebalance interrupts across cores depending on load"
+rcvar="irqrebalance_enable"
+command="/libexec/${name}"
+start_precmd="irqrebalance_precmd"
+command_args="&"
+
+irqrebalance_precmd()
+{
+
+	case "${irqrebalance_period}" in
+	'')
+		rc_flags="-f 60"
+		;;
+	*)
+		rc_flags="-f ${irqrebalance_period}"
+		;;
+	esac
+}
+
+load_rc_config $name
+run_rc_command "$1"
Index: libexec/Makefile
===================================================================
--- libexec/Makefile
+++ libexec/Makefile
@@ -9,6 +9,7 @@
 	${_comsat} \
 	${_dma} \
 	getty \
+	${_irqrebalance} \
 	${_mail.local} \
 	${_makewhatis.local} \
 	${_mknetid} \
@@ -58,6 +59,10 @@
 _dma=		dma
 .endif
 
+.if ${MK_IRQREBALANCE} != "no"
+_irqrebalance=	irqrebalance
+.endif
+
 .if ${MK_NIS} != "no"
 _mknetid=	mknetid
 _ypxfr=		ypxfr
Index: libexec/irqrebalance/Makefile
===================================================================
--- /dev/null
+++ libexec/irqrebalance/Makefile
@@ -0,0 +1,9 @@
+# $FreeBSD$
+
+.include <src.opts.mk>
+
+PROG=	irqrebalance
+MAN=
+SRCS=	irqrebalance.c
+
+.include <bsd.prog.mk>
Index: libexec/irqrebalance/irqrebalance.c
===================================================================
--- /dev/null
+++ libexec/irqrebalance/irqrebalance.c
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2017 Dell EMC Isilon
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/cpuset.h>
+#include <sys/sysctl.h>
+
+#include <err.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/* Rebalance period, in seconds, used when -f is not given. */
+#define	DEFAULT_PERIOD	60
+
+/* Size of the first buffer tried when reading a sysctl node. */
+#define	SYSCTL_BUFLEN	1024
+
+/*
+ * One entry per hw.intrcnt counter.  is_index remembers the counter's
+ * position in the sysctl arrays because intr_sources is re-sorted by load
+ * on every pass.  is_irq is -1 when the name is not of the form "irqN:".
+ */
+struct intr_src {
+	const char	*is_name;
+	unsigned long	 is_count;
+	size_t		 is_index;
+	int		 is_irq;
+};
+
+static cpuset_t cpus;
+static struct intr_src *intr_sources;
+static size_t nintr_sources;
+static char *intrnames;
+
+static void enumerate_irqs(void);
+static void irqshuffle(void);
+
+/*
+ * irqrebalance options:
+ *	-f period
+ *		Rebalance every N seconds (default: 60).
+ */
+int
+main(int argc, char **argv)
+{
+	char *end;
+	long period;
+	int c;
+
+	period = DEFAULT_PERIOD;
+	while ((c = getopt(argc, argv, "f:")) != -1) {
+		switch (c) {
+		case 'f':
+			/* strtol, unlike atol, lets us reject garbage. */
+			errno = 0;
+			period = strtol(optarg, &end, 10);
+			if (end == optarg || *end != '\0' || errno != 0 ||
+			    period <= 0 || period > INT_MAX)
+				errx(1, "invalid period: %s", optarg);
+			break;
+		default:
+			return (1);
+		}
+	}
+
+	enumerate_irqs();
+	while (true) {
+		irqshuffle();
+		sleep((unsigned int)period);
+	}
+	/* NOTREACHED */
+	return (0);
+}
+
+/*
+ * Read a sysctl node of unknown size into a malloc'd buffer, doubling the
+ * buffer until the value fits.  Stores the value's size in *lenp; the caller
+ * frees the result.  Exits on any error other than a too-small buffer.
+ */
+static void *
+fetch_sysctl(const char *oid, size_t *lenp)
+{
+	size_t buflen, len;
+	void *buf;
+
+	buf = NULL;
+	for (buflen = SYSCTL_BUFLEN; ; buflen *= 2) {
+		if ((buf = reallocf(buf, buflen)) == NULL)
+			err(1, "reallocf");
+		/*
+		 * The length is an in/out argument, so hand the kernel a copy
+		 * rather than the variable that drives the doubling.
+		 */
+		len = buflen;
+		if (sysctlbyname(oid, buf, &len, NULL, 0) == 0)
+			break;
+		/* Failure is reported through errno, not the return value. */
+		if (errno != ENOMEM)
+			err(1, "sysctl %s", oid);
+	}
+
+	*lenp = len;
+	return (buf);
+}
+
+/*
+ * Fetch the current hw.intrcnt counters into a malloc'd array that the
+ * caller must free.  Returns the number of counters.
+ */
+static size_t
+read_intrcnts(unsigned long **intrcnts)
+{
+	size_t len;
+
+	*intrcnts = fetch_sysctl("hw.intrcnt", &len);
+	return (len / sizeof(**intrcnts));
+}
+
+/*
+ * Build intr_sources from hw.intrnames, one entry per hw.intrcnt counter,
+ * and record the root cpuset that IRQs may be spread across.
+ */
+static void
+enumerate_irqs(void)
+{
+	const char *end, *name;
+	unsigned long *counts;
+	size_t i, inamlen;
+
+	intrnames = fetch_sysctl("hw.intrnames", &inamlen);
+	/* Make sure the string walk below cannot run off the buffer. */
+	if (inamlen > 0)
+		intrnames[inamlen - 1] = '\0';
+	end = intrnames + inamlen;
+
+	nintr_sources = read_intrcnts(&counts);
+	free(counts);
+
+	intr_sources = calloc(nintr_sources, sizeof(*intr_sources));
+	if (intr_sources == NULL)
+		err(1, "calloc");
+
+	name = intrnames;
+	for (i = 0; i < nintr_sources; i++) {
+		intr_sources[i].is_index = i;
+		intr_sources[i].is_irq = -1;
+		if (name >= end)
+			continue;
+		/* intrnames stays allocated, so no copy is needed. */
+		if (name[0] != '\0')
+			intr_sources[i].is_name = name;
+		/* Only names of the form "irqN:..." identify an IRQ. */
+		if (sscanf(name, "irq%d:", &intr_sources[i].is_irq) < 1)
+			intr_sources[i].is_irq = -1;
+		name += strlen(name) + 1;
+	}
+
+	if (cpuset_getaffinity(CPU_LEVEL_ROOT, CPU_WHICH_CPUSET, -1,
+	    sizeof(cpus), &cpus) != 0)
+		err(1, "cpuset_getaffinity");
+}
+
+/*
+ * qsort(3) comparator: ascending by interrupt count, ties broken by sysctl
+ * index so that repeated passes over unchanged counts produce the same
+ * order.
+ */
+static int
+intrcmp(const void *one, const void *two)
+{
+	const struct intr_src *i1, *i2;
+
+	i1 = one;
+	i2 = two;
+	if (i1->is_count != i2->is_count)
+		return (i1->is_count < i2->is_count ? -1 : 1);
+	if (i1->is_index != i2->is_index)
+		return (i1->is_index < i2->is_index ? -1 : 1);
+	return (0);
+}
+
+/*
+ * Set *output to the single CPU in allcpus found at or after idx, wrapping
+ * around, and return the index at which the next search should start.
+ * *output is left empty if allcpus has no CPUs.
+ */
+static size_t
+nextcpu(const cpuset_t *allcpus, size_t idx, cpuset_t *output)
+{
+	size_t tries;
+
+	CPU_ZERO(output);
+	for (tries = 0; tries < CPU_SETSIZE; tries++) {
+		if (CPU_ISSET(idx, allcpus)) {
+			CPU_SET(idx, output);
+			break;
+		}
+		idx = (idx + 1) % CPU_SETSIZE;
+	}
+	return ((idx + 1) % CPU_SETSIZE);
+}
+
+/*
+ * Sample the interrupt counters and bind IRQs to CPUs round-robin, busiest
+ * IRQ first.
+ */
+static void
+irqshuffle(void)
+{
+	struct intr_src *isrc;
+	unsigned long *intrcnts;
+	size_t current_cpu, i, nintrs;
+	cpuset_t mask;
+
+	nintrs = read_intrcnts(&intrcnts);
+
+	/*
+	 * intr_sources is left sorted by the previous pass, so look each
+	 * counter up by its original index, and never read past what the
+	 * kernel returned this time.
+	 */
+	for (i = 0; i < nintr_sources; i++) {
+		isrc = &intr_sources[i];
+		isrc->is_count = isrc->is_index < nintrs ?
+		    intrcnts[isrc->is_index] : 0;
+	}
+	free(intrcnts);
+
+	qsort(intr_sources, nintr_sources, sizeof(*intr_sources), intrcmp);
+
+	/*
+	 * Scan from the same location to avoid moving in the common case.
+	 */
+	current_cpu = 0;
+
+	for (i = nintr_sources; i-- > 0; ) {
+		isrc = &intr_sources[i];
+		if (isrc->is_irq < 0)
+			continue;
+
+		/* Only consume a CPU slot for sources we actually bind. */
+		current_cpu = nextcpu(&cpus, current_cpu, &mask);
+
+		/* XXX Differentiate managed and unmanaged irqs? */
+		if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_IRQ_ONLY,
+		    isrc->is_irq, sizeof(mask), &mask) != 0)
+			err(1, "cpuset_setaffinity");
+	}
+}
Index: share/mk/src.opts.mk
===================================================================
--- share/mk/src.opts.mk
+++ share/mk/src.opts.mk
@@ -112,6 +112,7 @@
     INETD \
     IPFILTER \
     IPFW \
+    IRQREBALANCE \
     ISCSI \
     JAIL \
     KDUMP \