diff --git a/usr.sbin/bhyve/acpi.c b/usr.sbin/bhyve/acpi.c
--- a/usr.sbin/bhyve/acpi.c
+++ b/usr.sbin/bhyve/acpi.c
@@ -40,6 +40,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
@@ -51,6 +53,7 @@
 #include
 #include
+#include
 #include
 
 #include "bhyverun.h"
@@ -726,6 +729,92 @@
         return (0);
 }
 
+static int
+build_srat(struct vmctx *const ctx)
+{
+        ACPI_TABLE_SRAT srat;
+        ACPI_SRAT_MEM_AFFINITY srat_mem_affinity;
+        ACPI_SRAT_CPU_AFFINITY srat_cpu_affinity;
+
+        struct basl_table *table;
+        int segid, domain;
+        int _flags, _prot;
+        vm_ooffset_t _off;
+        uint32_t cpu_id;
+        cpuset_t cpus;
+        size_t maplen;
+        uint64_t gpa;
+        int ret;
+
+        BASL_EXEC(basl_table_create(&table, ctx, ACPI_SIG_SRAT,
+            BASL_TABLE_ALIGNMENT));
+
+        memset(&srat, 0, sizeof(srat));
+        BASL_EXEC(basl_table_append_header(table, ACPI_SIG_SRAT, 1, 1));
+        srat.TableRevision = 1;
+        BASL_EXEC(basl_table_append_content(table, &srat, sizeof(srat)));
+
+        /*
+         * Iterate over the VM's memory maps and add
+         * a 'Memory Affinity Structure' for each mapping.
+         */
+        gpa = 0;
+        while (1) {
+                ret = vm_mmap_getnext(ctx, &gpa, &segid, &_off, &maplen, &_prot,
+                    &_flags);
+                if (ret) {
+                        break;
+                }
+
+                if (segid >= VM_SYSMEM && segid < VM_BOOTROM) {
+                        domain = segid - VM_SYSMEM;
+                } else {
+                        /* Treat devmem segs as domain 0. */
+                        domain = 0;
+                }
+                memset(&srat_mem_affinity, 0, sizeof(srat_mem_affinity));
+                srat_mem_affinity.Header.Type = ACPI_SRAT_TYPE_MEMORY_AFFINITY;
+                srat_mem_affinity.Header.Length = sizeof(srat_mem_affinity);
+                srat_mem_affinity.ProximityDomain = htole32(domain);
+                srat_mem_affinity.BaseAddress = htole64(gpa);
+                srat_mem_affinity.Length = htole64(maplen);
+                srat_mem_affinity.Flags = htole32(ACPI_SRAT_MEM_ENABLED);
+                BASL_EXEC(basl_table_append_bytes(table, &srat_mem_affinity,
+                    sizeof(srat_mem_affinity)));
+                gpa += maplen;
+        }
+
+        domain = 0;
+        while (domain < VM_MAXMEMDOM) {
+                ret = vm_get_domain_cpus(ctx, domain, &cpus);
+                if (ret) {
+                        if (errno == ENOENT)
+                                break;
+                        return (ret);
+                }
+                /* Add all domain CPUs. */
+                CPU_FOREACH_ISSET(cpu_id, &cpus) {
+                        memset(&srat_cpu_affinity, 0,
+                            sizeof(srat_cpu_affinity));
+                        srat_cpu_affinity.Header.Type =
+                            ACPI_SRAT_TYPE_CPU_AFFINITY;
+                        srat_cpu_affinity.Header.Length =
+                            sizeof(srat_cpu_affinity);
+                        srat_cpu_affinity.ProximityDomainLo = (uint8_t)domain;
+                        srat_cpu_affinity.ApicId = (uint8_t)cpu_id;
+                        srat_cpu_affinity.Flags =
+                            htole32(ACPI_SRAT_CPU_USE_AFFINITY);
+                        BASL_EXEC(basl_table_append_bytes(table,
+                            &srat_cpu_affinity, sizeof(srat_cpu_affinity)));
+                }
+                domain++;
+        }
+
+        BASL_EXEC(basl_table_register_to_rsdt(table));
+
+        return (0);
+}
+
 int
 acpi_build(struct vmctx *ctx, int ncpu)
 {
@@ -765,6 +854,7 @@
         BASL_EXEC(build_mcfg(ctx));
         BASL_EXEC(build_facs(ctx));
         BASL_EXEC(build_spcr(ctx));
+        BASL_EXEC(build_srat(ctx));
 
         /* Build ACPI device-specific tables such as a TPM2 table. */
         const struct acpi_device_list_entry *entry;
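For orientation, the sketch below shows the wire layout of the two SRAT subtables that build_srat() appends, using hand-rolled packed structs in place of the ACPICA types. It is a standalone illustration, not code from the patch: the domain, APIC ID, and address values are made up, and the real code additionally stores multi-byte fields in little-endian order via htole32()/htole64(). Building and running it prints the 16- and 40-byte entry sizes defined by the ACPI specification.

/*
 * Standalone illustration - not part of the patch.  The packed structs
 * below are simplified stand-ins for ACPICA's ACPI_SRAT_CPU_AFFINITY and
 * ACPI_SRAT_MEM_AFFINITY; the example values are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct srat_cpu_affinity {              /* SRAT subtable type 0, 16 bytes */
        uint8_t         type;
        uint8_t         length;
        uint8_t         proximity_domain_lo;
        uint8_t         apic_id;
        uint32_t        flags;
        uint8_t         local_sapic_eid;
        uint8_t         proximity_domain_hi[3];
        uint32_t        clock_domain;
} __attribute__((packed));

struct srat_mem_affinity {              /* SRAT subtable type 1, 40 bytes */
        uint8_t         type;
        uint8_t         length;
        uint32_t        proximity_domain;
        uint16_t        reserved;
        uint64_t        base_address;
        uint64_t        length_bytes;
        uint32_t        reserved1;
        uint32_t        flags;
        uint64_t        reserved2;
} __attribute__((packed));

int
main(void)
{
        struct srat_cpu_affinity cpu;
        struct srat_mem_affinity mem;

        /* One vCPU entry: vCPU 2 (APIC ID 2) placed in proximity domain 1. */
        memset(&cpu, 0, sizeof(cpu));
        cpu.type = 0;
        cpu.length = sizeof(cpu);
        cpu.proximity_domain_lo = 1;
        cpu.apic_id = 2;
        cpu.flags = 1;                  /* enabled (ACPI_SRAT_CPU_USE_AFFINITY) */

        /* One memory entry: 4 GiB of guest RAM at 4 GiB, also in domain 1. */
        memset(&mem, 0, sizeof(mem));
        mem.type = 1;
        mem.length = sizeof(mem);
        mem.proximity_domain = 1;
        mem.base_address = 4ULL << 30;
        mem.length_bytes = 4ULL << 30;
        mem.flags = 1;                  /* enabled (ACPI_SRAT_MEM_ENABLED) */

        printf("cpu entry: %zu bytes, mem entry: %zu bytes\n",
            sizeof(cpu), sizeof(mem));
        printf("cpu: domain %u, apic id %u\n",
            (unsigned)cpu.proximity_domain_lo, (unsigned)cpu.apic_id);
        printf("mem: domain %u, base 0x%llx, length 0x%llx\n",
            (unsigned)mem.proximity_domain,
            (unsigned long long)mem.base_address,
            (unsigned long long)mem.length_bytes);
        return (0);
}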
diff --git a/usr.sbin/bhyve/amd64/bhyverun_machdep.c b/usr.sbin/bhyve/amd64/bhyverun_machdep.c
--- a/usr.sbin/bhyve/amd64/bhyverun_machdep.c
+++ b/usr.sbin/bhyve/amd64/bhyverun_machdep.c
@@ -90,6 +90,7 @@
             "       -K: PS2 keyboard layout\n"
             "       -l: LPC device configuration\n"
             "       -m: memory size\n"
+            "       -n: NUMA domain specification\n"
             "       -o: set config 'var' to 'value'\n"
             "       -P: vmexit from the guest on pause\n"
             "       -p: pin 'vcpu' to 'hostcpu'\n"
@@ -116,9 +117,9 @@
         int c;
 
 #ifdef BHYVE_SNAPSHOT
-        optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:";
+        optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:n:l:K:U:r:";
 #else
-        optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:";
+        optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:n:l:K:U:";
 #endif
         while ((c = getopt(argc, argv, optstr)) != -1) {
                 switch (c) {
@@ -193,6 +194,11 @@
                 case 'm':
                         set_config_value("memory.size", optarg);
                         break;
+                case 'n':
+                        if (bhyve_numa_parse(optarg) != 0)
+                                errx(EX_USAGE,
+                                    "invalid NUMA configuration '%s'", optarg);
+                        break;
                 case 'o':
                         if (!bhyve_parse_config_option(optarg)) {
                                 errx(EX_USAGE,
diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8
--- a/usr.sbin/bhyve/bhyve.8
+++ b/usr.sbin/bhyve/bhyve.8
@@ -269,8 +269,37 @@
 (either upper or lower case) to indicate a multiple of kilobytes, megabytes,
 gigabytes, or terabytes.
 If no suffix is given, the value is assumed to be in megabytes.
 .Pp
 The default is 256M.
+.It Fl n Cm id= Ns Ar id Ns Cm \&,size= Ns Ar size Ns Cm \&,cpus= Ns Ar cpus Ns Op Cm \&,domain_policy= Ns Ar policy
+Configure a guest NUMA domain.
+.Pp
+.Nm
+provides NUMA emulation and can optionally be configured to allocate the
+memory of an emulated domain from a specific NUMA domain on the host.
+A guest can have up to 8 NUMA domains.
+This feature only works for guests that boot from a boot ROM.
+.Pp
+Each domain is identified by a numerical
+.Ar id .
+The domain memory
+.Ar size
+is specified using the same format as the
+.Fl m
+flag.
+The
+.Ar cpus
+parameter specifies the set of guest CPUs that belong to the domain.
+The optional
+.Cm domain_policy
+parameter configures the
+.Xr domainset 9
+memory allocation policy used to back the emulated domain with host memory.
+See the
+.Fl n
+flag of
+.Xr cpuset 1
+for the list of valid NUMA memory allocation policies and their format.
 .It Fl o Ar var Ns Cm = Ns Ar value
 Set the configuration variable
 .Ar var
@@ -1184,6 +1213,33 @@
 .Bd -literal -offset indent
 /usr/sbin/bhyve -k configfile vm0
 .Ed
+.Pp
+Run a UEFI virtual machine with four virtual CPUs and two emulated NUMA domains:
+.Bd -literal -offset indent
+bhyve -c 4 -m 8G -w -H \\
+  -s 0,hostbridge \\
+  -s 4,ahci-hd,disk.img \\
+  -s 31,lpc -l com1,stdio \\
+  -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\
+  -n id=0,size=4G,cpus=0-1 \\
+  -n id=1,size=4G,cpus=2-3 \\
+  numavm
+.Ed
+.Pp
+Assuming a host machine with two NUMA domains,
+run a UEFI virtual machine with two virtual CPUs, using a
+.Cm prefer
+.Xr domainset 9
+policy that prefers the first host NUMA domain for guest memory:
+.Bd -literal -offset indent
+bhyve -c 2 -m 4G -w -H \\
+  -s 0,hostbridge \\
+  -s 4,ahci-hd,disk.img \\
+  -s 31,lpc -l com1,stdio \\
+  -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\
+  -n id=0,size=4G,cpus=0-1,domain_policy=prefer:0 \\
+  numavm
+.Ed
 .Sh SEE ALSO
 .Xr bhyve 4 ,
 .Xr netgraph 4 ,
@@ -1194,6 +1250,7 @@
 .Xr ethers 5 ,
 .Xr bhyvectl 8 ,
-.Xr bhyveload 8
+.Xr bhyveload 8 ,
+.Xr domainset 9
 .Pp
 .Rs
 .%A Intel
diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h
--- a/usr.sbin/bhyve/bhyverun.h
+++ b/usr.sbin/bhyve/bhyverun.h
@@ -73,6 +73,7 @@
 #endif
 int bhyve_pincpu_parse(const char *opt);
 int bhyve_topology_parse(const char *opt);
+int bhyve_numa_parse(const char *opt);
 
 void bhyve_init_vcpu(struct vcpu *vcpu);
 void bhyve_start_vcpu(struct vcpu *vcpu, bool bsp);
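Before the bhyverun.c changes that implement it, here is a standalone sketch of how a -n argument such as "id=1,size=4G,cpus=2-3,domain_policy=prefer:1" is split with strsep() and where the pieces land in bhyve's configuration tree. The helper name and the printed output are illustrative only; the real bhyve_numa_parse() stores the values with set_config_value_node() under the "domains.<id>" node instead of printing them.

/*
 * Standalone illustration - not part of the patch.
 */
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void
parse_numa_opt(const char *opt)
{
        char *cp, *str, *tofree;
        const char *id = NULL, *size = NULL, *cpus = NULL, *policy = NULL;

        tofree = str = strdup(opt);
        if (str == NULL)
                err(1, "strdup");

        /* Split on ',' and match each piece against the known keys. */
        while ((cp = strsep(&str, ",")) != NULL) {
                if (strncmp(cp, "id=", strlen("id=")) == 0)
                        id = cp + strlen("id=");
                else if (strncmp(cp, "size=", strlen("size=")) == 0)
                        size = cp + strlen("size=");
                else if (strncmp(cp, "cpus=", strlen("cpus=")) == 0)
                        cpus = cp + strlen("cpus=");
                else if (strncmp(cp, "domain_policy=",
                    strlen("domain_policy=")) == 0)
                        policy = cp + strlen("domain_policy=");
        }
        if (id == NULL)
                errx(1, "missing NUMA domain ID in '%s'", opt);

        /* The equivalent config-tree assignments. */
        if (size != NULL)
                printf("domains.%s.size = %s\n", id, size);
        if (cpus != NULL)
                printf("domains.%s.cpus = %s\n", id, cpus);
        if (policy != NULL)
                printf("domains.%s.domain_policy = %s\n", id, policy);
        free(tofree);
}

int
main(void)
{
        parse_numa_opt("id=1,size=4G,cpus=2-3,domain_policy=prefer:1");
        return (0);
}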
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -53,6 +53,9 @@
 #ifdef BHYVE_SNAPSHOT
 #include
 #endif
+#include
+#include
+#include
 #include
 #include
 #include
@@ -108,6 +111,9 @@
 
 static cpuset_t cpumask;
 
+static struct vmdom guest_domains[VM_MAXMEMDOM];
+static int guest_ndomains;
+
 static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu);
 
 static struct vcpu_info {
@@ -179,6 +185,120 @@
         return (lval);
 }
 
+int
+bhyve_numa_parse(const char *opt)
+{
+        int id = -1;
+        nvlist_t *nvl;
+        char *cp, *str, *tofree;
+        char pathbuf[64] = { 0 };
+        char *size = NULL, *cpus = NULL, *domain_policy = NULL;
+
+        if (*opt == '\0') {
+                return (-1);
+        }
+
+        tofree = str = strdup(opt);
+        if (str == NULL)
+                errx(4, "Failed to allocate memory");
+
+        while ((cp = strsep(&str, ",")) != NULL) {
+                if (strncmp(cp, "id=", strlen("id=")) == 0)
+                        id = parse_int_value("id", cp + strlen("id="), 0,
+                            UINT8_MAX);
+                else if (strncmp(cp, "size=", strlen("size=")) == 0)
+                        size = cp + strlen("size=");
+                else if (strncmp(cp,
+                    "domain_policy=", strlen("domain_policy=")) == 0)
+                        domain_policy = cp + strlen("domain_policy=");
+                else if (strncmp(cp, "cpus=", strlen("cpus=")) == 0)
+                        cpus = cp + strlen("cpus=");
+        }
+
+        if (id == -1) {
+                EPRINTLN("Missing NUMA domain ID in '%s'", opt);
+                goto out;
+        }
+
+        snprintf(pathbuf, sizeof(pathbuf), "domains.%d", id);
+        nvl = find_config_node(pathbuf);
+        if (nvl == NULL)
+                nvl = create_config_node(pathbuf);
+        if (size != NULL)
+                set_config_value_node(nvl, "size", size);
+        if (domain_policy != NULL)
+                set_config_value_node(nvl, "domain_policy", domain_policy);
+        if (cpus != NULL)
+                set_config_value_node(nvl, "cpus", cpus);
+
+        free(tofree);
+        return (0);
+out:
+        free(tofree);
+        return (-1);
+}
+
+static void
+calc_mem_affinity(size_t vm_memsize)
+{
+        int i, error;
+        nvlist_t *nvl;
+        const char *value;
+        struct vmdom *dom;
+        char pathbuf[64] = { 0 };
+        size_t total_size = 0;
+
+        i = 0;
+        while (i < VM_MAXMEMDOM) {
+                snprintf(pathbuf, sizeof(pathbuf), "domains.%d", i);
+                nvl = find_config_node(pathbuf);
+                if (nvl == NULL)
+                        break;
+
+                dom = &guest_domains[i];
+                /* Check if all necessary properties have been defined. */
+                value = get_config_value_node(nvl, "size");
+                if (value == NULL) {
+                        EPRINTLN("Missing memory size for domain %d", i);
+                        exit(4);
+                }
+                error = vm_parse_memsize(value, &dom->size);
+                if (error)
+                        errx(EX_USAGE, "invalid memsize for domain %d: '%s'", i,
+                            value);
+
+                value = get_config_value_node(nvl, "domain_policy");
+                if (value == NULL) {
+                        dom->ds_policy = DOMAINSET_POLICY_INVALID;
+                        DOMAINSET_ZERO(&dom->ds_mask);
+                } else {
+                        if (domainset_parselist(value, &dom->ds_mask,
+                            &dom->ds_policy) != CPUSET_PARSE_OK) {
+                                errx(EX_USAGE,
+                                    "failed to parse domain policy '%s'",
+                                    value);
+                        }
+                }
+                i++;
+                total_size += dom->size;
+        }
+
+        if (i == 0) {
+                /*
+                 * No domains were specified - create domain
+                 * 0 holding all CPUs and memory.
+                 */
+                i = 1;
+                dom = &guest_domains[0];
+                dom->size = vm_memsize;
+        } else if (total_size != vm_memsize) {
+                errx(EX_USAGE,
+                    "Total NUMA domain memory size does not match provided VM memsize");
+        }
+        guest_ndomains = i;
+}
+
 /*
  * Set the sockets, cores, threads, and guest_cpus variables based on
  * the configured topology.
 */
@@ -340,6 +460,75 @@
         }
 }
 
+static void
+set_domain_cpus(struct vmctx *ctx)
+{
+        int i, error;
+        nvlist_t *nvl = NULL;
+        cpuset_t cpus;
+        const char *value, *reason;
+        char pathbuf[64] = { 0 };
+
+        for (i = 0; i < guest_ndomains; i++) {
+                snprintf(pathbuf, sizeof(pathbuf), "domains.%d", i);
+                nvl = find_config_node(pathbuf);
+                if (nvl == NULL)
+                        break;
+
+                value = get_config_value_node(nvl, "cpus");
+                if (value == NULL) {
+                        EPRINTLN("Missing CPU set for domain %d", i);
+                        exit(4);
+                }
+
+                parse_cpuset(i, value, &cpus);
+                error = vm_set_domain_cpus(ctx, i, &cpus);
+                if (error) {
+                        switch (errno) {
+                        case ENOENT:
+                                reason = "domain does not exist";
+                                break;
+                        case EEXIST:
+                                reason = "overlapping CPU sets";
+                                break;
+                        default:
+                                reason = strerror(errno);
+                                break;
+                        }
+                        EPRINTLN("Unable to set CPU affinity for domain %d: %s",
+                            i, reason);
+                        exit(4);
+                }
+        }
+
+        /*
+         * If we're dealing with one domain and no cpuset was provided, create a
+         * default one holding all cpus.
+         */
+        if (guest_ndomains == 1 && nvl == NULL) {
+                CPU_ZERO(&cpus);
+                for (i = 0; i < guest_ncpus; i++)
+                        CPU_SET(i, &cpus);
+                error = vm_set_domain_cpus(ctx, 0, &cpus);
+                if (error) {
+                        switch (errno) {
+                        case ENOENT:
+                                reason = "domain does not exist";
+                                break;
+                        case EEXIST:
+                                reason = "overlapping CPU sets";
+                                break;
+                        default:
+                                reason = strerror(errno);
+                                break;
+                        }
+                        EPRINTLN("Unable to set CPU affinity for domain %d: %s",
+                            0, reason);
+                        exit(4);
+                }
+        }
+}
+
 void *
 paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
 {
@@ -738,18 +927,21 @@
                 vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid);
         }
 
+        calc_mem_affinity(memsize);
         memflags = 0;
         if (get_config_bool_default("memory.wired", false))
                 memflags |= VM_MEM_F_WIRED;
         if (get_config_bool_default("memory.guest_in_core", false))
                 memflags |= VM_MEM_F_INCORE;
         vm_set_memflags(ctx, memflags);
-        error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
+        error = vm_setup_memory(ctx, guest_domains, guest_ndomains, memsize,
+            VM_MMAP_ALL);
         if (error) {
                 fprintf(stderr, "Unable to setup memory (%d)\n", errno);
                 exit(4);
         }
 
+        set_domain_cpus(ctx);
         init_mem(guest_ncpus);
         init_bootrom(ctx);
         if (bhyve_init_platform(ctx, bsp) != 0)
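To close, a standalone sketch of the size accounting that calc_mem_affinity() enforces: every configured domain must carry a size, the sizes have to add up exactly to the -m value, and when no -n options are given a single domain 0 covering all guest memory is assumed. The struct and helper names below are simplified stand-ins, not bhyve's struct vmdom or its actual code.

/*
 * Standalone illustration - not part of the patch.
 */
#include <err.h>
#include <stddef.h>
#include <stdio.h>

#define MAXDOMS 8                       /* the guest domain limit documented above */

struct dom {
        size_t  size;                   /* bytes of guest memory in this domain */
};

static int
check_domains(struct dom *doms, int ndoms, size_t vm_memsize)
{
        size_t total = 0;
        int i;

        if (ndoms == 0) {
                /* No -n options: one implicit domain holding everything. */
                doms[0].size = vm_memsize;
                return (1);
        }
        for (i = 0; i < ndoms; i++)
                total += doms[i].size;
        if (total != vm_memsize)
                errx(1, "domain sizes (%zu) do not match guest memory (%zu)",
                    total, vm_memsize);
        return (ndoms);
}

int
main(void)
{
        struct dom doms[MAXDOMS] = {
                { .size = 4UL << 30 },  /* -n id=0,size=4G,... */
                { .size = 4UL << 30 },  /* -n id=1,size=4G,... */
        };

        printf("%d domain(s) OK\n", check_domains(doms, 2, 8UL << 30));
        return (0);
}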