diff --git a/usr.sbin/bhyve/acpi.h b/usr.sbin/bhyve/acpi.h --- a/usr.sbin/bhyve/acpi.h +++ b/usr.sbin/bhyve/acpi.h @@ -56,6 +56,7 @@ int acpi_build(struct vmctx *ctx, int ncpu); void acpi_raise_gpe(struct vmctx *ctx, unsigned bit); int acpi_tables_add_device(const struct acpi_device *const dev); +int acpi_add_vcpu_affinity(int vcpuid, int domain); void dsdt_line(const char *fmt, ...); void dsdt_fixed_ioport(uint16_t iobase, uint16_t length); void dsdt_fixed_irq(uint8_t irq); diff --git a/usr.sbin/bhyve/acpi.c b/usr.sbin/bhyve/acpi.c --- a/usr.sbin/bhyve/acpi.c +++ b/usr.sbin/bhyve/acpi.c @@ -37,9 +37,12 @@ */ #include +#include +#include #include #include #include +#include #include #include @@ -50,7 +53,9 @@ #include #include +#include #include +#include #include #include "bhyverun.h" @@ -78,6 +83,22 @@ static char basl_template[MAXPATHLEN]; static char basl_stemplate[MAXPATHLEN]; +/* + * SRAT vCPU affinity info. + */ +struct acpi_vcpu_affinity_entry { + RB_ENTRY(acpi_vcpu_affinity_entry) entry; + int vcpuid; + int domain; +}; + +static int vcpu_affinity_cmp(struct acpi_vcpu_affinity_entry *const a1, + struct acpi_vcpu_affinity_entry *const a2); +static RB_HEAD(vcpu_affinities, + acpi_vcpu_affinity_entry) aff_head = RB_INITIALIZER(&aff_head); +RB_GENERATE_STATIC(vcpu_affinities, acpi_vcpu_affinity_entry, entry, + vcpu_affinity_cmp); + /* * State for dsdt_line(), dsdt_indent(), and dsdt_unindent(). */ @@ -121,6 +142,31 @@ return (0); } +static int +vcpu_affinity_cmp(struct acpi_vcpu_affinity_entry *a1, + struct acpi_vcpu_affinity_entry *a2) +{ + return (a1->vcpuid < a2->vcpuid ? -1 : a1->vcpuid > a2->vcpuid); +} + +int +acpi_add_vcpu_affinity(int vcpuid, int domain) +{ + struct acpi_vcpu_affinity_entry *entry = calloc(1, sizeof(*entry)); + if (entry == NULL) { + return (ENOMEM); + } + + entry->vcpuid = vcpuid; + entry->domain = domain; + if (RB_INSERT(vcpu_affinities, &aff_head, entry) != NULL) { + free(entry); + return (EEXIST); + } + + return (0); +} + /* * Helper routines for writing to the DSDT from other modules. */ @@ -726,6 +772,83 @@ return (0); } +static int +build_srat(struct vmctx *const ctx) +{ + ACPI_TABLE_SRAT srat; + ACPI_SRAT_MEM_AFFINITY srat_mem_affinity; + ACPI_SRAT_CPU_AFFINITY srat_cpu_affinity; + + struct acpi_vcpu_affinity_entry *ep; + struct basl_table *table; + int segid, domain; + int _flags, _prot; + vm_ooffset_t _off; + size_t maplen; + uint64_t gpa; + int ret; + + if (RB_EMPTY(&aff_head)) + return (0); + + memset(&srat, 0, sizeof(srat)); + BASL_EXEC(basl_table_create(&table, ctx, ACPI_SIG_SRAT, + BASL_TABLE_ALIGNMENT)); + BASL_EXEC(basl_table_append_header(table, ACPI_SIG_SRAT, 1, 1)); + srat.TableRevision = 1; + BASL_EXEC(basl_table_append_content(table, &srat, sizeof(srat))); + + /* + * Iterate over the VM's memory maps and add + * a 'Memory Affinity Structure' for each mapping. + */ + gpa = 0; + while (1) { + ret = vm_mmap_getnext(ctx, &gpa, &segid, &_off, &maplen, &_prot, + &_flags); + if (ret) { + break; + } + + if (segid >= VM_SYSMEM && segid < VM_BOOTROM) { + domain = segid - VM_SYSMEM; + } else { + /* Treat devmem segs as domain 0. */ + domain = 0; + } + memset(&srat_mem_affinity, 0, sizeof(srat_mem_affinity)); + srat_mem_affinity.Header.Type = ACPI_SRAT_TYPE_MEMORY_AFFINITY; + srat_mem_affinity.Header.Length = sizeof(srat_mem_affinity); + srat_mem_affinity.Flags |= ACPI_SRAT_MEM_ENABLED; + srat_mem_affinity.ProximityDomain = htole32(domain); + srat_mem_affinity.BaseAddress = htole64(gpa); + srat_mem_affinity.Length = htole64(maplen); + srat_mem_affinity.Flags = htole32(ACPI_SRAT_MEM_ENABLED); + BASL_EXEC(basl_table_append_bytes(table, &srat_mem_affinity, + sizeof(srat_mem_affinity))); + gpa += maplen; + } + + /* + * Iterate over each "vCPUid to domain id" mapping and emit a + * 'Processor Local APIC/SAPIC Affinity Structure' for each entry. + */ + RB_FOREACH(ep, vcpu_affinities, &aff_head) { + memset(&srat_cpu_affinity, 0, sizeof(srat_cpu_affinity)); + srat_cpu_affinity.Header.Type = ACPI_SRAT_TYPE_CPU_AFFINITY; + srat_cpu_affinity.Header.Length = sizeof(srat_cpu_affinity); + srat_cpu_affinity.ProximityDomainLo = (uint8_t)ep->domain; + srat_cpu_affinity.ApicId = (uint8_t)ep->vcpuid; + srat_cpu_affinity.Flags = htole32(ACPI_SRAT_CPU_USE_AFFINITY); + BASL_EXEC(basl_table_append_bytes(table, &srat_cpu_affinity, + sizeof(srat_cpu_affinity))); + } + + BASL_EXEC(basl_table_register_to_rsdt(table)); + + return (0); +} + int acpi_build(struct vmctx *ctx, int ncpu) { @@ -765,6 +888,7 @@ BASL_EXEC(build_mcfg(ctx)); BASL_EXEC(build_facs(ctx)); BASL_EXEC(build_spcr(ctx)); + BASL_EXEC(build_srat(ctx)); /* Build ACPI device-specific tables such as a TPM2 table. */ const struct acpi_device_list_entry *entry; diff --git a/usr.sbin/bhyve/amd64/bhyverun_machdep.c b/usr.sbin/bhyve/amd64/bhyverun_machdep.c --- a/usr.sbin/bhyve/amd64/bhyverun_machdep.c +++ b/usr.sbin/bhyve/amd64/bhyverun_machdep.c @@ -91,6 +91,7 @@ " -K: PS2 keyboard layout\n" " -l: LPC device configuration\n" " -m: memory size\n" + " -n: NUMA domain specification\n" " -o: set config 'var' to 'value'\n" " -P: vmexit from the guest on pause\n" " -p: pin 'vcpu' to 'hostcpu'\n" @@ -117,9 +118,9 @@ int c; #ifdef BHYVE_SNAPSHOT - optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:"; + optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:n:l:K:U:r:"; #else - optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:"; + optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:n:l:K:U:"; #endif while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { @@ -194,6 +195,15 @@ case 'm': set_config_value("memory.size", optarg); break; + case 'n': + if (bhyve_numa_parse(optarg) != 0) + errx(EX_USAGE, + "invalid NUMA configuration " + "'%s'", + optarg); + if (!get_config_bool("acpi_tables")) + errx(EX_USAGE, "NUMA emulation requires ACPI"); + break; case 'o': if (!bhyve_parse_config_option(optarg)) { errx(EX_USAGE, diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8 --- a/usr.sbin/bhyve/bhyve.8 +++ b/usr.sbin/bhyve/bhyve.8 @@ -269,8 +269,55 @@ (either upper or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, or terabytes. If no suffix is given, the value is assumed to be in megabytes. -.Pp The default is 256M. +.Pp +.It Fl n Ar id Ns Cm \&, Ns Ar size Ns Cm \&, Ns Ar cpus Ns Op Cm \&, Ns Ar domain_policy +Configure guest NUMA domains. This option applies only to the amd64 platform. +.Pp +The +.Fl n +option allows the guest physical address space to be partitioned into domains. +The layout of each domain is encoded in an ACPI table +visible to the guest operating system. +The +.Fl n +option also allows the specification of a +.Xr domainset 9 +memory allocation policy for the host memory backing a given NUMA domain. +A guest can have up to 8 NUMA domains. +This feature requires that the guest use a boot ROM, and in +particular cannot be used if the guest was initialized using +.Xr bhyveload 8 . +.Pp +Each domain is identified by a numerical +.Em id . +The domain memory +.Em size +is specified using the same format as the +.Fl m +flag. +The sum of all +.Em size +parameters overrides the total VM memory size specified by the +.Fl m +flag. +However, if at least one domain memory size parameter is +missing, the total VM memory size will be equally distributed across +all emulated domains. +The +.Em cpuset +parameter specifies the set of CPUs that are part of the domain. +The +.Em domain_policy +parameter may be optionally used to configure the +.Xr domainset 9 +host NUMA memory allocation policy for an emulated +domain. +See the +.Ar -n +flag in +.Xr cpuset 1 +for a list of valid NUMA memory allocation policies and their formats. .It Fl o Ar var Ns Cm = Ns Ar value Set the configuration variable .Ar var @@ -1202,6 +1249,33 @@ .Bd -literal -offset indent /usr/sbin/bhyve -k configfile vm0 .Ed +.Pp +Run a UEFI virtual machine with four CPUs and two emulated NUMA domains: +.Bd -literal -offset indent +bhyve -c 4 -w -H \\ + -s 0,hostbridge \\ + -s 4,ahci-hd,disk.img \\ + -s 31,lpc -l com1,stdio \\ + -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ + -n id=0,size=4G,cpus=0-1 \\ + -n id=1,size=4G,cpus=2-3 \\ + numavm +.Ed +.Pp +Assuming a host machine with two NUMA domains, +run a UEFI virtual machine with four CPUs using a +.Ar prefer +.Xr domainset 9 +policy to allocate guest memory from the first host NUMA domain only. +.Bd -literal -offset indent +bhyve -c 2 -w -H \\ + -s 0,hostbridge \\ + -s 4,ahci-hd,disk.img \\ + -s 31,lpc -l com1,stdio \\ + -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\ + -n id=0,size=4G,cpus=0-1,domain_policy=prefer:0 \\ + numavm +.Ed .Sh SEE ALSO .Xr bhyve 4 , .Xr netgraph 4 , @@ -1211,7 +1285,8 @@ .Xr bhyve_config 5 , .Xr ethers 5 , .Xr bhyvectl 8 , -.Xr bhyveload 8 +.Xr bhyveload 8 , +.Xr domainset 9 .Pp .Rs .%A Intel diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h --- a/usr.sbin/bhyve/bhyverun.h +++ b/usr.sbin/bhyve/bhyverun.h @@ -73,6 +73,7 @@ #endif int bhyve_pincpu_parse(const char *opt); int bhyve_topology_parse(const char *opt); +int bhyve_numa_parse(const char *opt); void bhyve_init_vcpu(struct vcpu *vcpu); void bhyve_start_vcpu(struct vcpu *vcpu, bool bsp); diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -30,6 +30,8 @@ #ifndef WITHOUT_CAPSICUM #include #endif +#include +#include #include #ifdef BHYVE_SNAPSHOT #include @@ -54,6 +56,7 @@ #include #endif #include +#include #include #include #include @@ -68,6 +71,7 @@ #include #endif +#include #include #include "acpi.h" @@ -108,6 +112,9 @@ static cpuset_t cpumask; +static struct vm_mem_domain guest_domains[VM_MAXMEMDOM]; +static int guest_ndomains = 0; + static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu); static struct vcpu_info { @@ -179,6 +186,117 @@ return (lval); } +int +bhyve_numa_parse(const char *opt) +{ + int id = -1; + nvlist_t *nvl; + char *cp, *str, *tofree; + char pathbuf[64] = { 0 }; + char *size = NULL, *cpus = NULL, *domain_policy = NULL; + + if (*opt == '\0') { + return (-1); + } + + tofree = str = strdup(opt); + if (str == NULL) + errx(4, "Failed to allocate memory"); + + while ((cp = strsep(&str, ",")) != NULL) { + if (strncmp(cp, "id=", strlen("id=")) == 0) + id = parse_int_value("id", cp + strlen("id="), 0, + UINT8_MAX); + else if (strncmp(cp, "size=", strlen("size=")) == 0) + size = cp + strlen("size="); + else if (strncmp(cp, + "domain_policy=", strlen("domain_policy=")) == 0) + domain_policy = cp + strlen("domain_policy="); + else if (strncmp(cp, "cpus=", strlen("cpus=")) == 0) + cpus = cp + strlen("cpus="); + } + + if (id == -1) { + EPRINTLN("Missing NUMA domain ID in '%s'", opt); + goto out; + } + + snprintf(pathbuf, sizeof(pathbuf), "domains.%d", id); + nvl = find_config_node(pathbuf); + if (nvl == NULL) + nvl = create_config_node(pathbuf); + if (size != NULL) + set_config_value_node(nvl, "size", size); + if (domain_policy != NULL) + set_config_value_node(nvl, "domain_policy", domain_policy); + if (cpus != NULL) + set_config_value_node(nvl, "cpus", cpus); + + free(tofree); + return (0); + +out: + free(tofree); + return (-1); +} + +static void +calc_mem_affinity(size_t vm_memsize) +{ + int i; + nvlist_t *nvl; + bool need_recalc; + const char *value; + struct vm_mem_domain *dom; + char pathbuf[64] = { 0 }; + + need_recalc = false; + for (i = 0; i < VM_MAXMEMDOM; i++) { + dom = &guest_domains[i]; + snprintf(pathbuf, sizeof(pathbuf), "domains.%d", i); + nvl = find_config_node(pathbuf); + if (nvl == NULL) { + break; + } + + value = get_config_value_node(nvl, "size"); + need_recalc |= value == NULL; + if (value != NULL && vm_parse_memsize(value, &dom->size)) { + errx(EX_USAGE, "invalid memsize for domain %d: '%s'", i, + value); + } + + dom->ds_mask = calloc(1, sizeof(domainset_t)); + if (dom->ds_mask == NULL) { + errx(EX_OSERR, "Failed to allocate domainset mask"); + } + value = get_config_value_node(nvl, "domain_policy"); + if (value == NULL) { + dom->ds_policy = DOMAINSET_POLICY_INVALID; + DOMAINSET_ZERO(dom->ds_mask); + } else if (domainset_parselist(value, dom->ds_mask, &dom->ds_policy) != + CPUSET_PARSE_OK) { + errx(EX_USAGE, "failed to parse domain policy '%s'", value); + } + } + + guest_ndomains = i; + if (guest_ndomains == 0) { + /* + * No domains were specified - create domain + * 0 holding all CPUs and memory. + */ + guest_ndomains = 1; + guest_domains[0].size = vm_memsize; + } else if (need_recalc) { + warnx("At least one domain memory size was not specified, distributing" + " total VM memory size across all domains"); + for (i = 0; i < guest_ndomains; i++) { + guest_domains[i].size = vm_memsize / guest_ndomains; + } + } +} + /* * Set the sockets, cores, threads, and guest_cpus variables based on * the configured topology. @@ -340,6 +458,56 @@ } } +static void +set_vcpu_affinities(void) +{ + int cpu, error; + nvlist_t *nvl = NULL; + cpuset_t cpus; + const char *value; + char pathbuf[64] = { 0 }; + + for (int dom = 0; dom < guest_ndomains; dom++) { + snprintf(pathbuf, sizeof(pathbuf), "domains.%d", dom); + nvl = find_config_node(pathbuf); + if (nvl == NULL) + break; + + value = get_config_value_node(nvl, "cpus"); + if (value == NULL) { + EPRINTLN("Missing CPU set for domain %d", dom); + exit(4); + } + + parse_cpuset(dom, value, &cpus); + CPU_FOREACH_ISSET(cpu, &cpus) { + error = acpi_add_vcpu_affinity(cpu, dom); + if (error) { + EPRINTLN( + "Unable to set vCPU %d affinity for domain %d: %s", + cpu, dom, strerror(errno)); + exit(4); + } + } + } + if (guest_ndomains > 1 || nvl != NULL) + return; + + /* + * If we're dealing with one domain and no cpuset was provided, create a + * default one holding all cpus. + */ + for (cpu = 0; cpu < guest_ncpus; cpu++) { + error = acpi_add_vcpu_affinity(cpu, 0); + if (error) { + EPRINTLN( + "Unable to set vCPU %d affinity for domain %d: %s", + cpu, 0, strerror(errno)); + exit(4); + } + } +} + void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { @@ -713,18 +881,21 @@ vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid); } + calc_mem_affinity(memsize); memflags = 0; if (get_config_bool_default("memory.wired", false)) memflags |= VM_MEM_F_WIRED; if (get_config_bool_default("memory.guest_in_core", false)) memflags |= VM_MEM_F_INCORE; vm_set_memflags(ctx, memflags); - error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); + error = vm_setup_memory_domains(ctx, VM_MMAP_ALL, guest_domains, + guest_ndomains); if (error) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(4); } + set_vcpu_affinities(); init_mem(guest_ncpus); init_bootrom(ctx); if (bhyve_init_platform(ctx, bsp) != 0) diff --git a/usr.sbin/bhyve/bootrom.c b/usr.sbin/bhyve/bootrom.c --- a/usr.sbin/bhyve/bootrom.c +++ b/usr.sbin/bhyve/bootrom.c @@ -31,6 +31,7 @@ #include #include +#include #include #include diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c --- a/usr.sbin/bhyve/pci_emul.c +++ b/usr.sbin/bhyve/pci_emul.c @@ -42,6 +42,7 @@ #include #include +#include #include #include #include diff --git a/usr.sbin/bhyve/pci_fbuf.c b/usr.sbin/bhyve/pci_fbuf.c --- a/usr.sbin/bhyve/pci_fbuf.c +++ b/usr.sbin/bhyve/pci_fbuf.c @@ -29,6 +29,7 @@ #include #include +#include #include #include #include diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c --- a/usr.sbin/bhyve/pci_passthru.c +++ b/usr.sbin/bhyve/pci_passthru.c @@ -38,6 +38,7 @@ #include #include +#include #include