diff --git a/sys/net/iflib.c b/sys/net/iflib.c
--- a/sys/net/iflib.c
+++ b/sys/net/iflib.c
@@ -195,6 +195,8 @@
 	uint16_t ifc_sysctl_core_offset;
 #define	CORE_OFFSET_UNSPECIFIED	0xffff
 	uint8_t ifc_sysctl_separate_txrx;
+	uint8_t ifc_sysctl_use_logical_cores;
+	bool ifc_cpus_are_physical_cores;
 	qidx_t ifc_sysctl_ntxds[8];
 	qidx_t ifc_sysctl_nrxds[8];
 
@@ -725,7 +727,7 @@
 	SLIST_ENTRY(cpu_offset) entries;
 	cpuset_t	set;
 	unsigned int	refcount;
-	uint16_t	offset;
+	uint16_t	next_cpuid;
 };
 static struct mtx cpu_offset_mtx;
 MTX_SYSINIT(iflib_cpu_offset, &cpu_offset_mtx, "iflib_cpu_offset lock",
@@ -4682,41 +4684,291 @@
 	pfil_head_unregister(pfil);
 }
+
+/*
+ * Advance forward by n members of the cpuset ctx->ifc_cpus starting from
+ * cpuid and wrapping as necessary.
+ */
+static unsigned int
+cpuid_advance(if_ctx_t ctx, unsigned int cpuid, unsigned int n)
+{
+	unsigned int first_valid;
+	unsigned int last_valid;
+
+	/* cpuid should always be in the valid set */
+	MPASS(CPU_ISSET(cpuid, &ctx->ifc_cpus));
+
+	/* valid set should never be empty */
+	MPASS(!CPU_EMPTY(&ctx->ifc_cpus));
+
+	first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
+	last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
+	n = n % CPU_COUNT(&ctx->ifc_cpus);
+	while (n > 0) {
+		do {
+			cpuid++;
+			if (cpuid > last_valid)
+				cpuid = first_valid;
+		} while (!CPU_ISSET(cpuid, &ctx->ifc_cpus));
+		n--;
+	}
+
+	return (cpuid);
+}
+
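
To make the wraparound concrete, here is a minimal userspace sketch of
cpuid_advance() (an illustration under assumptions, not kernel code: the
valid set is modeled as a plain 32-bit mask, and advance() is a
hypothetical stand-in for the cpuset_t-based routine):

#include <assert.h>
#include <stdio.h>

static unsigned int
advance(unsigned int mask, unsigned int cpuid, unsigned int n)
{
	unsigned int first = __builtin_ctz(mask);	/* CPU_FFS(set) - 1 */
	unsigned int last = 31 - __builtin_clz(mask);	/* CPU_FLS(set) - 1 */

	assert(mask != 0 && (mask & (1u << cpuid)) != 0);
	n %= (unsigned int)__builtin_popcount(mask);	/* CPU_COUNT(set) */
	while (n > 0) {
		do {
			cpuid++;
			if (cpuid > last)
				cpuid = first;
		} while ((mask & (1u << cpuid)) == 0);
		n--;
	}
	return (cpuid);
}

int
main(void)
{
	unsigned int mask = 0xcc;	/* valid CPUs {2, 3, 6, 7} */

	printf("%u\n", advance(mask, 3, 2));	/* 7 */
	printf("%u\n", advance(mask, 7, 1));	/* wraps around to 2 */
	printf("%u\n", advance(mask, 3, 5));	/* 5 mod 4 == 1, so 6 */
	return (0);
}

Note the reduction of n modulo CPU_COUNT() up front: advancing by any
multiple of the set size is a no-op, which is what lets the queue
assignments below wrap cleanly around a small CPU set.
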
+#if defined(SMP) && defined(SCHED_ULE)
+extern struct cpu_group *cpu_top;              /* CPU topology */
+
+static int
+find_child_with_core(int cpu, struct cpu_group *grp)
+{
+	int i;
+
+	if (grp->cg_children == 0)
+		return -1;
+
+	MPASS(grp->cg_child);
+	for (i = 0; i < grp->cg_children; i++) {
+		if (CPU_ISSET(cpu, &grp->cg_child[i].cg_mask))
+			return i;
+	}
+
+	return -1;
+}
+
+
+/*
+ * Find an L2 neighbor of the given CPU or return -1 if none found.  This
+ * does not distinguish among multiple L2 neighbors if the given CPU has
+ * more than one (it will always return the same result in that case).
+ */
+static int
+find_l2_neighbor(int cpu)
+{
+	struct cpu_group *grp;
+	int i;
+
+	grp = cpu_top;
+	if (grp == NULL)
+		return -1;
+
+	/*
+	 * Find the smallest CPU group that contains the given core.
+	 */
+	i = 0;
+	while ((i = find_child_with_core(cpu, grp)) != -1) {
+		/*
+		 * If the smallest group containing the given CPU has less
+		 * than two members, we conclude the given CPU has no
+		 * L2 neighbor.
+		 */
+		if (grp->cg_child[i].cg_count <= 1)
+			return (-1);
+		grp = &grp->cg_child[i];
+	}
+
+	/* Must share L2. */
+	if (grp->cg_level > CG_SHARE_L2 || grp->cg_level == CG_SHARE_NONE)
+		return -1;
+
+	/*
+	 * Select the first member of the set that isn't the reference
+	 * CPU, which at this point is guaranteed to exist.
+	 */
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		if (CPU_ISSET(i, &grp->cg_mask) && i != cpu)
+			return (i);
+	}
+
+	/* Should never be reached */
+	return (-1);
+}
+
+#else
+static int
+find_l2_neighbor(int cpu)
+{
+
+	return (-1);
+}
+#endif
+
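
The descent above is easiest to see on a concrete topology. Below is a
toy model (struct group, l2_neighbor(), and the example tree are
illustrative stand-ins for struct cpu_group and find_l2_neighbor(), not
the kernel API), modeling a hypothetical single package of two SMT-2
cores:

#include <stdio.h>

enum { SHARE_NONE, SHARE_L1, SHARE_L2, SHARE_L3 };

struct group {
	unsigned int mask;	/* stand-in for cg_mask */
	int level;		/* stand-in for cg_level */
	int count;		/* stand-in for cg_count */
	int nchildren;		/* stand-in for cg_children */
	struct group *child;	/* stand-in for cg_child */
};

static int
l2_neighbor(struct group *grp, int cpu)
{
	int i, found;

	for (;;) {
		/* the find_child_with_core() step */
		for (found = -1, i = 0; i < grp->nchildren; i++)
			if (grp->child[i].mask & (1u << cpu))
				found = i;
		if (found == -1)
			break;
		if (grp->child[found].count <= 1)
			return (-1);	/* singleton group: no neighbor */
		grp = &grp->child[found];
	}
	if (grp->level > SHARE_L2 || grp->level == SHARE_NONE)
		return (-1);	/* smallest group does not share L2 */
	for (i = 0; i < 32; i++)	/* lowest member that isn't cpu */
		if ((grp->mask & (1u << i)) && i != cpu)
			return (i);
	return (-1);
}

int
main(void)
{
	struct group pair[2] = {
		{ 0x3, SHARE_L2, 2, 0, NULL },	/* SMT pair: CPUs 0, 1 */
		{ 0xc, SHARE_L2, 2, 0, NULL },	/* SMT pair: CPUs 2, 3 */
	};
	struct group pkg = { 0xf, SHARE_L3, 4, 2, pair };

	printf("%d\n", l2_neighbor(&pkg, 0));	/* 1: CPU 0's SMT sibling */
	printf("%d\n", l2_neighbor(&pkg, 3));	/* 2: always lowest member */
	return (0);
}

The final scan explains the "always return the same result" caveat in the
comment above: among several neighbors, the lowest-numbered member of the
group that is not the reference CPU is always the one picked.
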
+/*
+ * CPU mapping behaviors
+ * ---------------------
+ * 'separate txrx' refers to the separate_txrx sysctl
+ * 'use logical' refers to the use_logical_cores sysctl
+ * 'INTR CPUS' indicates whether bus_get_cpus(INTR_CPUS) succeeded
+ *
+ *  separate     use     INTR
+ *    txrx     logical   CPUS   result
+ * ---------- --------- ------ ------------------------------------------------
+ *     -          -       X     RX and TX queues mapped to consecutive physical
+ *                              cores with RX/TX pairs on same core and excess
+ *                              of either following
+ *     -          X       X     RX and TX queues mapped to consecutive cores
+ *                              of any type with RX/TX pairs on same core and
+ *                              excess of either following
+ *     X          -       X     RX and TX queues mapped to consecutive physical
+ *                              cores; all RX then all TX
+ *     X          X       X     RX queues mapped to consecutive physical cores
+ *                              first, then TX queues mapped to L2 neighbor of
+ *                              the corresponding RX queue if one exists,
+ *                              otherwise to consecutive physical cores
+ *     -         n/a      -     RX and TX queues mapped to consecutive cores of
+ *                              any type with RX/TX pairs on same core and excess
+ *                              of either following
+ *     X         n/a      -     RX and TX queues mapped to consecutive cores of
+ *                              any type; all RX then all TX
+ */
+static unsigned int
+get_cpuid_for_queue(if_ctx_t ctx, unsigned int base_cpuid, unsigned int qid,
+    bool is_tx)
+{
+	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
+	unsigned int core_index;
+
+	if (ctx->ifc_sysctl_separate_txrx) {
+		/*
+		 * When using separate CPUs for TX and RX, the assignment
+		 * will always be of a consecutive CPU out of the set of
+		 * context CPUs, except for the specific case where the
+		 * context CPUs are physical cores, the use of logical cores
+		 * has been enabled, the assignment is for TX, the TX qid
+		 * corresponds to an RX qid, and the CPU assigned to the
+		 * corresponding RX queue has an L2 neighbor.
+		 */
+		if (ctx->ifc_sysctl_use_logical_cores &&
+		    ctx->ifc_cpus_are_physical_cores &&
+		    is_tx && qid < scctx->isc_nrxqsets) {
+			int l2_neighbor;
+			unsigned int rx_cpuid;
+
+			rx_cpuid = cpuid_advance(ctx, base_cpuid, qid);
+			l2_neighbor = find_l2_neighbor(rx_cpuid);
+			if (l2_neighbor != -1) {
+				return (l2_neighbor);
+			}
+			/*
+			 * ... else fall through to the normal
+			 * consecutive-after-RX assignment scheme.
+			 *
+			 * Note that we are assuming that all RX queue CPUs
+			 * have an L2 neighbor, or all do not.  If a mixed
+			 * scenario is possible, we will have to keep track
+			 * separately of how many queues prior to this one
+			 * were not able to be assigned to an L2 neighbor.
+			 */
+		}
+		if (is_tx)
+			core_index = scctx->isc_nrxqsets + qid;
+		else
+			core_index = qid;
+	} else {
+		core_index = qid;
+	}
+
+	return (cpuid_advance(ctx, base_cpuid, core_index));
+}
+
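
As a worked example of the table above (hypothetical topology): suppose
the context CPU set is the physical cores {0, 2, 4, 6} of an SMT-2 system
whose logical siblings are {1, 3, 5, 7}, the base CPU ID is 0, and the
driver has 4 RX and 4 TX queues:

    separate_txrx=0:                 RX0/TX0->0, RX1/TX1->2, RX2/TX2->4, RX3/TX3->6
    separate_txrx=1, use_logical=0:  RX0-3 -> 0,2,4,6; TX0-3 -> 0,2,4,6 (wrapped)
    separate_txrx=1, use_logical=1:  RX0-3 -> 0,2,4,6; TX0-3 -> 1,3,5,7 (L2 neighbors)

In the middle case the TX core_index values 4-7 wrap around the
four-member set (4 mod 4 == 0), so the TX queues land back on the RX
CPUs; on this topology, enabling use_logical_cores is what actually
separates them.
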
 static uint16_t
 get_ctx_core_offset(if_ctx_t ctx)
 {
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 	struct cpu_offset *op;
-	uint16_t qc;
-	uint16_t ret = ctx->ifc_sysctl_core_offset;
+	cpuset_t assigned_cpus;
+	unsigned int cores_consumed;
+	unsigned int base_cpuid = ctx->ifc_sysctl_core_offset;
+	unsigned int first_valid;
+	unsigned int last_valid;
+	unsigned int i;
 
-	if (ret != CORE_OFFSET_UNSPECIFIED)
-		return (ret);
+	first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
+	last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
 
-	if (ctx->ifc_sysctl_separate_txrx)
-		qc = scctx->isc_ntxqsets + scctx->isc_nrxqsets;
-	else
-		qc = max(scctx->isc_ntxqsets, scctx->isc_nrxqsets);
+	if (base_cpuid != CORE_OFFSET_UNSPECIFIED) {
+		/*
+		 * Align the user-chosen base CPU ID to the next valid CPU
+		 * for this device.  If the chosen base CPU ID is smaller
+		 * than the first valid CPU or larger than the last valid
+		 * CPU, we assume the user does not know what the valid
+		 * range is for this device and is thinking in terms of a
+		 * zero-based reference frame, and so we shift the given
+		 * value into the valid range (and wrap accordingly) so the
+		 * intent is translated to the proper frame of reference.
+		 * If the base CPU ID is within the valid first/last, but
+		 * does not correspond to a valid CPU, it is advanced to the
+		 * next valid CPU (wrapping if necessary).
+		 */
+		if (base_cpuid < first_valid || base_cpuid > last_valid) {
+			/* shift from zero-based to first_valid-based */
+			base_cpuid += first_valid;
+			/* wrap to range [first_valid, last_valid] */
+			base_cpuid = first_valid + (base_cpuid - first_valid) %
+			    (last_valid - first_valid + 1);
+		}
+		if (!CPU_ISSET(base_cpuid, &ctx->ifc_cpus)) {
+			/*
+			 * base_cpuid is in [first_valid, last_valid], but
+			 * not a member of the valid set.  In this case,
+			 * there will always be a member of the valid set
+			 * with a CPU ID that is greater than base_cpuid,
+			 * and we simply advance to it.
+			 */
+			while (!CPU_ISSET(base_cpuid, &ctx->ifc_cpus))
+				base_cpuid++;
+		}
+		return (base_cpuid);
+	}
+
+	/*
+	 * Determine how many cores will be consumed by performing the CPU
+	 * assignments and counting how many of the assigned CPUs correspond
+	 * to CPUs in the set of context CPUs.  This is done using the CPU
+	 * ID first_valid as the base CPU ID, as the base CPU must be within
+	 * the set of context CPUs.
+	 *
+	 * Note not all assigned CPUs will be in the set of context CPUs
+	 * when separate CPUs are being allocated to TX and RX queues,
+	 * assignment to logical cores has been enabled, the set of context
+	 * CPUs contains only physical CPUs, and TX queues are mapped to L2
+	 * neighbors of CPUs that RX queues have been mapped to - in this
+	 * case we only want to count how many CPUs in the set of context
+	 * CPUs have been consumed, as that determines the next CPU in that
+	 * set to start allocating at for the next device for which
+	 * core_offset is not set.
+	 */
+	CPU_ZERO(&assigned_cpus);
+	for (i = 0; i < scctx->isc_ntxqsets; i++)
+		CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, true),
+		    &assigned_cpus);
+	for (i = 0; i < scctx->isc_nrxqsets; i++)
+		CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, false),
+		    &assigned_cpus);
+	CPU_AND(&assigned_cpus, &ctx->ifc_cpus);
+	cores_consumed = CPU_COUNT(&assigned_cpus);
 
 	mtx_lock(&cpu_offset_mtx);
 	SLIST_FOREACH(op, &cpu_offsets, entries) {
 		if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) {
-			ret = op->offset;
-			op->offset += qc;
+			base_cpuid = op->next_cpuid;
+			op->next_cpuid = cpuid_advance(ctx, op->next_cpuid,
+			    cores_consumed);
 			MPASS(op->refcount < UINT_MAX);
 			op->refcount++;
 			break;
 		}
 	}
-	if (ret == CORE_OFFSET_UNSPECIFIED) {
-		ret = 0;
+	if (base_cpuid == CORE_OFFSET_UNSPECIFIED) {
+		base_cpuid = first_valid;
 		op = malloc(sizeof(struct cpu_offset), M_IFLIB,
 		    M_NOWAIT | M_ZERO);
 		if (op == NULL) {
 			device_printf(ctx->ifc_dev,
 			    "allocation for cpu offset failed.\n");
 		} else {
-			op->offset = qc;
+			op->next_cpuid = cpuid_advance(ctx, base_cpuid,
+			    cores_consumed);
 			op->refcount = 1;
 			CPU_COPY(&ctx->ifc_cpus, &op->set);
 			SLIST_INSERT_HEAD(&cpu_offsets, op, entries);
@@ -4724,7 +4976,7 @@
 	}
 	mtx_unlock(&cpu_offset_mtx);
 
-	return (ret);
+	return (base_cpuid);
 }
 
 static void
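
As a worked example of the alignment above (hypothetical values): if the
valid set for a device is CPUs 8-15, a configured core_offset of 2 lies
below first_valid and is treated as zero-based, yielding 8 + (2 mod 8) =
10; a value of 17 lies above last_valid and wraps to 8 + (17 mod 8) = 9.
A value of 12 already lies within [8, 15], so it is kept if CPU 12 is a
member of the set and otherwise advanced to the next member.
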
@@ -4855,7 +5107,9 @@
 	if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) != 0) {
 		device_printf(dev, "Unable to fetch CPU list\n");
 		CPU_COPY(&all_cpus, &ctx->ifc_cpus);
-	}
+		ctx->ifc_cpus_are_physical_cores = false;
+	} else
+		ctx->ifc_cpus_are_physical_cores = true;
 	MPASS(CPU_COUNT(&ctx->ifc_cpus) > 0);
 
 	/*
@@ -5970,128 +6224,6 @@
 	return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name));
 }
 
-#ifdef SMP
-static int
-find_nth(if_ctx_t ctx, int qid)
-{
-	cpuset_t cpus;
-	int i, cpuid, eqid, count;
-
-	CPU_COPY(&ctx->ifc_cpus, &cpus);
-	count = CPU_COUNT(&cpus);
-	eqid = qid % count;
-	/* clear up to the qid'th bit */
-	for (i = 0; i < eqid; i++) {
-		cpuid = CPU_FFS(&cpus);
-		MPASS(cpuid != 0);
-		CPU_CLR(cpuid-1, &cpus);
-	}
-	cpuid = CPU_FFS(&cpus);
-	MPASS(cpuid != 0);
-	return (cpuid-1);
-}
-
-#ifdef SCHED_ULE
-extern struct cpu_group *cpu_top;              /* CPU topology */
-
-static int
-find_child_with_core(int cpu, struct cpu_group *grp)
-{
-	int i;
-
-	if (grp->cg_children == 0)
-		return -1;
-
-	MPASS(grp->cg_child);
-	for (i = 0; i < grp->cg_children; i++) {
-		if (CPU_ISSET(cpu, &grp->cg_child[i].cg_mask))
-			return i;
-	}
-
-	return -1;
-}
-
-/*
- * Find the nth "close" core to the specified core
- * "close" is defined as the deepest level that shares
- * at least an L2 cache.  With threads, this will be
- * threads on the same core.  If the shared cache is L3
- * or higher, simply returns the same core.
- */
-static int
-find_close_core(int cpu, int core_offset)
-{
-	struct cpu_group *grp;
-	int i;
-	int fcpu;
-	cpuset_t cs;
-
-	grp = cpu_top;
-	if (grp == NULL)
-		return cpu;
-	i = 0;
-	while ((i = find_child_with_core(cpu, grp)) != -1) {
-		/* If the child only has one cpu, don't descend */
-		if (grp->cg_child[i].cg_count <= 1)
-			break;
-		grp = &grp->cg_child[i];
-	}
-
-	/* If they don't share at least an L2 cache, use the same CPU */
-	if (grp->cg_level > CG_SHARE_L2 || grp->cg_level == CG_SHARE_NONE)
-		return cpu;
-
-	/* Now pick one */
-	CPU_COPY(&grp->cg_mask, &cs);
-
-	/* Add the selected CPU offset to core offset. */
-	for (i = 0; (fcpu = CPU_FFS(&cs)) != 0; i++) {
-		if (fcpu - 1 == cpu)
-			break;
-		CPU_CLR(fcpu - 1, &cs);
-	}
-	MPASS(fcpu);
-
-	core_offset += i;
-
-	CPU_COPY(&grp->cg_mask, &cs);
-	for (i = core_offset % grp->cg_count; i > 0; i--) {
-		MPASS(CPU_FFS(&cs));
-		CPU_CLR(CPU_FFS(&cs) - 1, &cs);
-	}
-	MPASS(CPU_FFS(&cs));
-	return CPU_FFS(&cs) - 1;
-}
-#else
-static int
-find_close_core(int cpu, int core_offset __unused)
-{
-	return cpu;
-}
-#endif
-
-static int
-get_core_offset(if_ctx_t ctx, iflib_intr_type_t type, int qid)
-{
-	switch (type) {
-	case IFLIB_INTR_TX:
-		/* TX queues get cores which share at least an L2 cache with the corresponding RX queue */
-		/* XXX handle multiple RX threads per core and more than two core per L2 group */
-		return qid / CPU_COUNT(&ctx->ifc_cpus) + 1;
-	case IFLIB_INTR_RX:
-	case IFLIB_INTR_RXTX:
-		/* RX queues get the specified core */
-		return qid / CPU_COUNT(&ctx->ifc_cpus);
-	default:
-		return -1;
-	}
-}
-#else
-#define get_core_offset(ctx, type, qid)	CPU_FIRST()
-#define find_close_core(cpuid, tid)	CPU_FIRST()
-#define find_nth(ctx, gid)		CPU_FIRST()
-#endif
-
 /* Just to avoid copy/paste */
 static inline int
 iflib_irq_set_affinity(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,
@@ -6099,21 +6231,14 @@
     const char *name)
 {
 	device_t dev;
-	int co, cpuid, err, tid;
+	unsigned int base_cpuid, cpuid;
+	int err;
 
 	dev = ctx->ifc_dev;
-	co = ctx->ifc_sysctl_core_offset;
-	if (ctx->ifc_sysctl_separate_txrx && type == IFLIB_INTR_TX)
-		co += ctx->ifc_softc_ctx.isc_nrxqsets;
-	cpuid = find_nth(ctx, qid + co);
-	tid = get_core_offset(ctx, type, qid);
-	if (tid < 0) {
-		device_printf(dev, "get_core_offset failed\n");
-		return (EOPNOTSUPP);
-	}
-	cpuid = find_close_core(cpuid, tid);
-	err = taskqgroup_attach_cpu(tqg, gtask, uniq, cpuid, dev, irq->ii_res,
-	    name);
+	base_cpuid = ctx->ifc_sysctl_core_offset;
+	cpuid = get_cpuid_for_queue(ctx, base_cpuid, qid, type == IFLIB_INTR_TX);
+	err = taskqgroup_attach_cpu(tqg, gtask, uniq, cpuid, dev,
+	    irq ? irq->ii_res : NULL, name);
 	if (err) {
 		device_printf(dev, "taskqgroup_attach_cpu failed %d\n", err);
 		return (err);
@@ -6202,8 +6327,8 @@
 		return (0);
 
 	if (tqrid != -1) {
-		err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg,
-		    q, name);
+		err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q,
+		    name);
 		if (err)
 			return (err);
 	} else {
@@ -6216,6 +6341,7 @@
 void
 iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type, void *arg, int qid, const char *name)
 {
+	device_t dev;
 	struct grouptask *gtask;
 	struct taskqgroup *tqg;
 	gtask_fn_t *fn;
@@ -6247,14 +6373,11 @@
 	default:
 		panic("unknown net intr type");
 	}
-	if (irq != NULL) {
-		err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg,
-		    q, name);
-		if (err)
-			taskqgroup_attach(tqg, gtask, q, ctx->ifc_dev,
-			    irq->ii_res, name);
-	} else {
-		taskqgroup_attach(tqg, gtask, q, NULL, NULL, name);
+	err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q, name);
+	if (err) {
+		dev = ctx->ifc_dev;
+		taskqgroup_attach(tqg, gtask, q, dev, irq ? irq->ii_res : NULL,
+		    name);
 	}
 }
 
@@ -6736,6 +6859,9 @@
 	SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "separate_txrx",
 	    CTLFLAG_RDTUN, &ctx->ifc_sysctl_separate_txrx, 0,
 	    "use separate cores for TX and RX");
+	SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "use_logical_cores",
+	    CTLFLAG_RDTUN, &ctx->ifc_sysctl_use_logical_cores, 0,
+	    "try to make use of logical cores for TX and RX");
 
 	/* XXX change for per-queue sizes */
 	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds",
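
Both new knobs are CTLFLAG_RDTUN, so in practice they are set as tunables
before the driver attaches. A hypothetical usage sketch (the ix.0 device
is an assumption for illustration; iflib exposes its knobs under
dev.<driver>.<unit>.iflib), in /boot/loader.conf:

    dev.ix.0.iflib.separate_txrx="1"
    dev.ix.0.iflib.use_logical_cores="1"

After boot, the resulting bindings can be read back through the read-only
per-queue "cpu" nodes added in the next two hunks, e.g.
sysctl dev.ix.0.iflib.txq0.cpu.
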
@@ -6780,6 +6906,9 @@
 		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
 		queue_list = SYSCTL_CHILDREN(queue_node);
+		SYSCTL_ADD_INT(ctx_list, queue_list, OID_AUTO, "cpu",
+		    CTLFLAG_RD,
+		    &txq->ift_task.gt_cpu, 0, "cpu this queue is bound to");
 #if MEMORY_LOGGING
 		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued",
 		    CTLFLAG_RD,
@@ -6862,6 +6991,9 @@
 		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
 		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
 		queue_list = SYSCTL_CHILDREN(queue_node);
+		SYSCTL_ADD_INT(ctx_list, queue_list, OID_AUTO, "cpu",
+		    CTLFLAG_RD,
+		    &rxq->ifr_task.gt_cpu, 0, "cpu this queue is bound to");
 		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
 			SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_cidx",
 			    CTLFLAG_RD,
diff --git a/sys/sys/cpuset.h b/sys/sys/cpuset.h
--- a/sys/sys/cpuset.h
+++ b/sys/sys/cpuset.h
@@ -65,6 +65,7 @@
 #define	CPU_OR_ATOMIC(d, s)		BIT_OR_ATOMIC(CPU_SETSIZE, d, s)
 #define	CPU_COPY_STORE_REL(f, t)	BIT_COPY_STORE_REL(CPU_SETSIZE, f, t)
 #define	CPU_FFS(p)			BIT_FFS(CPU_SETSIZE, p)
+#define	CPU_FLS(p)			BIT_FLS(CPU_SETSIZE, p)
 #define	CPU_COUNT(p)			((int)BIT_COUNT(CPU_SETSIZE, p))
 #define	CPUSET_FSET			BITSET_FSET(_NCPUWORDS)
 #define	CPUSET_T_INITIALIZER		BITSET_T_INITIALIZER
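
CPU_FFS() and the new CPU_FLS() follow the 1-based ffs(3)/fls(3)
convention, where 0 means the set is empty; this is why the iflib code
above subtracts 1 to derive first_valid and last_valid. A small userspace
sketch of the convention (using FreeBSD's ffs()/fls() from <strings.h> on
a plain mask as stand-ins for the bitset macros):

#include <assert.h>
#include <strings.h>

int
main(void)
{
	int mask = 0xcc;		/* CPUs {2, 3, 6, 7} */

	assert(ffs(mask) == 3);		/* lowest member is 2, so FFS is 3 */
	assert(fls(mask) == 8);		/* highest member is 7, so FLS is 8 */
	assert(ffs(0) == 0);		/* 0 signals an empty set */

	/* first_valid/last_valid as get_ctx_core_offset() computes them */
	assert(ffs(mask) - 1 == 2);
	assert(fls(mask) - 1 == 7);
	return (0);
}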