Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F142202297
D44567.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
16 KB
Referenced Files
None
Subscribers
None
D44567.diff
View Options
diff --git a/usr.sbin/bhyve/acpi.h b/usr.sbin/bhyve/acpi.h
--- a/usr.sbin/bhyve/acpi.h
+++ b/usr.sbin/bhyve/acpi.h
@@ -56,6 +56,7 @@
int acpi_build(struct vmctx *ctx, int ncpu);
void acpi_raise_gpe(struct vmctx *ctx, unsigned bit);
int acpi_tables_add_device(const struct acpi_device *const dev);
+int acpi_add_vcpu_affinity(int vcpuid, int domain);
void dsdt_line(const char *fmt, ...);
void dsdt_fixed_ioport(uint16_t iobase, uint16_t length);
void dsdt_fixed_irq(uint8_t irq);
diff --git a/usr.sbin/bhyve/acpi.c b/usr.sbin/bhyve/acpi.c
--- a/usr.sbin/bhyve/acpi.c
+++ b/usr.sbin/bhyve/acpi.c
@@ -37,9 +37,12 @@
*/
#include <sys/param.h>
+#include <sys/cpuset.h>
+#include <sys/domainset.h>
#include <sys/endian.h>
#include <sys/errno.h>
#include <sys/stat.h>
+#include <sys/tree.h>
#include <err.h>
#include <paths.h>
@@ -50,7 +53,9 @@
#include <string.h>
#include <unistd.h>
+#include <dev/vmm/vmm_mem.h>
#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
#include <vmmapi.h>
#include "bhyverun.h"
@@ -78,6 +83,22 @@
static char basl_template[MAXPATHLEN];
static char basl_stemplate[MAXPATHLEN];
+/*
+ * SRAT vCPU affinity info.
+ *
+ * One node per vCPU-to-domain assignment.  Nodes are linked into the
+ * vcpu_affinities red-black tree, are created by acpi_add_vcpu_affinity()
+ * and are read back by build_srat().
+ */
+struct acpi_vcpu_affinity_entry {
+ RB_ENTRY(acpi_vcpu_affinity_entry) entry; /* tree linkage */
+ int vcpuid; /* tree key; emitted as the SRAT ApicId */
+ int domain; /* emitted as the SRAT ProximityDomainLo */
+};
+
+/*
+ * Tree of vCPU affinity entries, ordered by vcpu_affinity_cmp().  aff_head
+ * is the single file-local tree root; RB_GENERATE_STATIC emits the
+ * file-local routines behind RB_INSERT and RB_FOREACH for this tree type.
+ */
+static int vcpu_affinity_cmp(struct acpi_vcpu_affinity_entry *const a1,
+ struct acpi_vcpu_affinity_entry *const a2);
+static RB_HEAD(vcpu_affinities,
+ acpi_vcpu_affinity_entry) aff_head = RB_INITIALIZER(&aff_head);
+RB_GENERATE_STATIC(vcpu_affinities, acpi_vcpu_affinity_entry, entry,
+ vcpu_affinity_cmp);
+
/*
* State for dsdt_line(), dsdt_indent(), and dsdt_unindent().
*/
@@ -121,6 +142,31 @@
return (0);
}
+/*
+ * Ordering callback for the vcpu_affinities tree.  Entries are compared by
+ * vcpuid only; the result is -1, 0 or 1.
+ */
+static int
+vcpu_affinity_cmp(struct acpi_vcpu_affinity_entry *a1,
+    struct acpi_vcpu_affinity_entry *a2)
+{
+	if (a1->vcpuid < a2->vcpuid)
+		return (-1);
+	if (a1->vcpuid > a2->vcpuid)
+		return (1);
+	return (0);
+}
+
+/*
+ * Record that vCPU 'vcpuid' belongs to NUMA domain 'domain'.
+ *
+ * Returns 0 on success, ENOMEM if the entry cannot be allocated, or EEXIST
+ * if an entry for 'vcpuid' is already in the tree (the new entry is freed).
+ */
+int
+acpi_add_vcpu_affinity(int vcpuid, int domain)
+{
+	struct acpi_vcpu_affinity_entry *new_aff;
+
+	new_aff = calloc(1, sizeof(*new_aff));
+	if (new_aff == NULL)
+		return (ENOMEM);
+	new_aff->vcpuid = vcpuid;
+	new_aff->domain = domain;
+
+	/* RB_INSERT() yields NULL once the node has been linked in. */
+	if (RB_INSERT(vcpu_affinities, &aff_head, new_aff) == NULL)
+		return (0);
+
+	/* A node with the same vcpuid already exists; drop ours. */
+	free(new_aff);
+	return (EEXIST);
+}
+
/*
* Helper routines for writing to the DSDT from other modules.
*/
@@ -726,6 +772,83 @@
return (0);
}
+/*
+ * Build the ACPI SRAT table describing guest NUMA affinity.
+ *
+ * Nothing is built when no vCPU affinity has been registered through
+ * acpi_add_vcpu_affinity().  Otherwise the table carries one memory
+ * affinity structure per guest memory mapping, followed by one processor
+ * affinity structure per registered vCPU, and is registered with the RSDT.
+ *
+ * Returns 0 on success.  Every basl_*() call is wrapped in BASL_EXEC().
+ */
+static int
+build_srat(struct vmctx *const ctx)
+{
+	ACPI_TABLE_SRAT srat;
+	ACPI_SRAT_MEM_AFFINITY srat_mem_affinity;
+	ACPI_SRAT_CPU_AFFINITY srat_cpu_affinity;
+
+	struct acpi_vcpu_affinity_entry *ep;
+	struct basl_table *table;
+	int segid, domain;
+	int _flags, _prot;
+	vm_ooffset_t _off;
+	size_t maplen;
+	uint64_t gpa;
+
+	if (RB_EMPTY(&aff_head))
+		return (0);
+
+	memset(&srat, 0, sizeof(srat));
+	srat.TableRevision = htole32(1);
+	BASL_EXEC(basl_table_create(&table, ctx, ACPI_SIG_SRAT,
+	    BASL_TABLE_ALIGNMENT));
+	BASL_EXEC(basl_table_append_header(table, ACPI_SIG_SRAT, 1, 1));
+	BASL_EXEC(basl_table_append_content(table, &srat, sizeof(srat)));
+
+	/*
+	 * Iterate over the VM's memory maps and add
+	 * a 'Memory Affinity Structure' for each mapping.
+	 * The walk ends at the first vm_mmap_getnext() failure.
+	 */
+	gpa = 0;
+	for (;;) {
+		if (vm_mmap_getnext(ctx, &gpa, &segid, &_off, &maplen, &_prot,
+		    &_flags) != 0)
+			break;
+
+		if (segid >= VM_SYSMEM && segid < VM_BOOTROM) {
+			/* System memory segments map 1:1 onto domains. */
+			domain = segid - VM_SYSMEM;
+		} else {
+			/* Treat devmem segs as domain 0. */
+			domain = 0;
+		}
+		memset(&srat_mem_affinity, 0, sizeof(srat_mem_affinity));
+		srat_mem_affinity.Header.Type = ACPI_SRAT_TYPE_MEMORY_AFFINITY;
+		srat_mem_affinity.Header.Length = sizeof(srat_mem_affinity);
+		srat_mem_affinity.ProximityDomain = htole32(domain);
+		srat_mem_affinity.BaseAddress = htole64(gpa);
+		srat_mem_affinity.Length = htole64(maplen);
+		srat_mem_affinity.Flags = htole32(ACPI_SRAT_MEM_ENABLED);
+		BASL_EXEC(basl_table_append_bytes(table, &srat_mem_affinity,
+		    sizeof(srat_mem_affinity)));
+		/* Resume the search just past the mapping that was emitted. */
+		gpa += maplen;
+	}
+
+	/*
+	 * Iterate over each "vCPUid to domain id" mapping and emit a
+	 * 'Processor Local APIC/SAPIC Affinity Structure' for each entry.
+	 * Both values are narrowed to the structure's 8-bit fields.
+	 */
+	RB_FOREACH(ep, vcpu_affinities, &aff_head) {
+		memset(&srat_cpu_affinity, 0, sizeof(srat_cpu_affinity));
+		srat_cpu_affinity.Header.Type = ACPI_SRAT_TYPE_CPU_AFFINITY;
+		srat_cpu_affinity.Header.Length = sizeof(srat_cpu_affinity);
+		srat_cpu_affinity.ProximityDomainLo = (uint8_t)ep->domain;
+		srat_cpu_affinity.ApicId = (uint8_t)ep->vcpuid;
+		srat_cpu_affinity.Flags = htole32(ACPI_SRAT_CPU_USE_AFFINITY);
+		BASL_EXEC(basl_table_append_bytes(table, &srat_cpu_affinity,
+		    sizeof(srat_cpu_affinity)));
+	}
+
+	BASL_EXEC(basl_table_register_to_rsdt(table));
+
+	return (0);
+}
+
int
acpi_build(struct vmctx *ctx, int ncpu)
{
@@ -765,6 +888,7 @@
BASL_EXEC(build_mcfg(ctx));
BASL_EXEC(build_facs(ctx));
BASL_EXEC(build_spcr(ctx));
+ BASL_EXEC(build_srat(ctx));
/* Build ACPI device-specific tables such as a TPM2 table. */
const struct acpi_device_list_entry *entry;
diff --git a/usr.sbin/bhyve/amd64/bhyverun_machdep.c b/usr.sbin/bhyve/amd64/bhyverun_machdep.c
--- a/usr.sbin/bhyve/amd64/bhyverun_machdep.c
+++ b/usr.sbin/bhyve/amd64/bhyverun_machdep.c
@@ -91,6 +91,7 @@
" -K: PS2 keyboard layout\n"
" -l: LPC device configuration\n"
" -m: memory size\n"
+ " -n: NUMA domain specification\n"
" -o: set config 'var' to 'value'\n"
" -P: vmexit from the guest on pause\n"
" -p: pin 'vcpu' to 'hostcpu'\n"
@@ -117,9 +118,9 @@
int c;
#ifdef BHYVE_SNAPSHOT
- optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:";
+ optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:n:l:K:U:r:";
#else
- optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:";
+ optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:n:l:K:U:";
#endif
while ((c = getopt(argc, argv, optstr)) != -1) {
switch (c) {
@@ -194,6 +195,15 @@
case 'm':
set_config_value("memory.size", optarg);
break;
+ case 'n':
+ if (bhyve_numa_parse(optarg) != 0)
+ errx(EX_USAGE,
+ "invalid NUMA configuration "
+ "'%s'",
+ optarg);
+ if (!get_config_bool("acpi_tables"))
+ errx(EX_USAGE, "NUMA emulation requires ACPI");
+ break;
case 'o':
if (!bhyve_parse_config_option(optarg)) {
errx(EX_USAGE,
diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8
--- a/usr.sbin/bhyve/bhyve.8
+++ b/usr.sbin/bhyve/bhyve.8
@@ -269,8 +269,56 @@
(either upper or lower case)
to indicate a multiple of kilobytes, megabytes, gigabytes, or terabytes.
If no suffix is given, the value is assumed to be in megabytes.
-.Pp
The default is 256M.
+.Pp
+.It Fl n Ar id Ns Cm \&, Ns Ar size Ns Cm \&, Ns Ar cpus Ns Op Cm \&, Ns Ar domain_policy
+Configure guest NUMA domains.
+This option applies only to the amd64 platform.
+.Pp
+The
+.Fl n
+option allows the guest physical address space to be partitioned into domains.
+The layout of each domain is encoded in an ACPI table
+visible to the guest operating system.
+The
+.Fl n
+option also allows the specification of a
+.Xr domainset 9
+memory allocation policy for the host memory backing a given NUMA domain.
+A guest can have up to 8 NUMA domains.
+This feature requires that the guest use a boot ROM, and in
+particular cannot be used if the guest was initialized using
+.Xr bhyveload 8 .
+.Pp
+Each domain is identified by a numerical
+.Em id .
+The domain memory
+.Em size
+is specified using the same format as the
+.Fl m
+flag.
+The sum of all
+.Em size
+parameters overrides the total VM memory size specified by the
+.Fl m
+flag.
+However, if at least one domain memory size parameter is
+missing, the total VM memory size will be equally distributed across
+all emulated domains.
+The
+.Em cpus
+parameter specifies the set of CPUs that are part of the domain.
+The
+.Em domain_policy
+parameter may be optionally used to configure the
+.Xr domainset 9
+host NUMA memory allocation policy for an emulated
+domain.
+See the
+.Fl n
+flag in
+.Xr cpuset 1
+for a list of valid NUMA memory allocation policies and their formats.
.It Fl o Ar var Ns Cm = Ns Ar value
Set the configuration variable
.Ar var
@@ -1202,6 +1250,33 @@
.Bd -literal -offset indent
/usr/sbin/bhyve -k configfile vm0
.Ed
+.Pp
+Run a UEFI virtual machine with four CPUs and two emulated NUMA domains:
+.Bd -literal -offset indent
+bhyve -c 4 -w -H \\
+ -s 0,hostbridge \\
+ -s 4,ahci-hd,disk.img \\
+ -s 31,lpc -l com1,stdio \\
+ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\
+ -n id=0,size=4G,cpus=0-1 \\
+ -n id=1,size=4G,cpus=2-3 \\
+ numavm
+.Ed
+.Pp
+Assuming a host machine with two NUMA domains,
+run a UEFI virtual machine with two CPUs using a
+.Ar prefer
+.Xr domainset 9
+policy to preferentially allocate guest memory from the first host NUMA domain.
+.Bd -literal -offset indent
+bhyve -c 2 -w -H \\
+ -s 0,hostbridge \\
+ -s 4,ahci-hd,disk.img \\
+ -s 31,lpc -l com1,stdio \\
+ -l bootrom,/usr/local/share/uefi-firmware/BHYVE_UEFI.fd \\
+ -n id=0,size=4G,cpus=0-1,domain_policy=prefer:0 \\
+ numavm
+.Ed
.Sh SEE ALSO
.Xr bhyve 4 ,
.Xr netgraph 4 ,
@@ -1211,7 +1286,8 @@
.Xr bhyve_config 5 ,
.Xr ethers 5 ,
.Xr bhyvectl 8 ,
-.Xr bhyveload 8
+.Xr bhyveload 8 ,
+.Xr domainset 9
.Pp
.Rs
.%A Intel
diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h
--- a/usr.sbin/bhyve/bhyverun.h
+++ b/usr.sbin/bhyve/bhyverun.h
@@ -73,6 +73,7 @@
#endif
int bhyve_pincpu_parse(const char *opt);
int bhyve_topology_parse(const char *opt);
+int bhyve_numa_parse(const char *opt);
void bhyve_init_vcpu(struct vcpu *vcpu);
void bhyve_start_vcpu(struct vcpu *vcpu, bool bsp);
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -30,6 +30,8 @@
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
+#include <sys/cpuset.h>
+#include <sys/domainset.h>
#include <sys/mman.h>
#ifdef BHYVE_SNAPSHOT
#include <sys/socket.h>
@@ -54,6 +56,7 @@
#include <fcntl.h>
#endif
#include <libgen.h>
+#include <libutil.h>
#include <unistd.h>
#include <assert.h>
#include <pthread.h>
@@ -68,6 +71,7 @@
#include <libxo/xo.h>
#endif
+#include <dev/vmm/vmm_mem.h>
#include <vmmapi.h>
#include "acpi.h"
@@ -108,6 +112,9 @@
static cpuset_t cpumask;
+static struct vm_mem_domain guest_domains[VM_MAXMEMDOM];
+static int guest_ndomains = 0;
+
static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu);
static struct vcpu_info {
@@ -179,6 +186,118 @@
return (lval);
}
+/*
+ * Parse one NUMA domain specification, a comma-separated list of
+ * "id=", "size=", "cpus=" and "domain_policy=" settings, and store the
+ * supplied values under the "domains.<id>" config node.
+ *
+ * "id=" is mandatory and is limited to the VM_MAXMEMDOM domains that
+ * guest_domains[] can hold.  Empty list items are skipped.
+ *
+ * Returns 0 on success, or -1 if 'opt' is empty, has no "id=", or contains
+ * an unrecognized setting.
+ */
+int
+bhyve_numa_parse(const char *opt)
+{
+	int id = -1, ret = -1;
+	nvlist_t *nvl;
+	char *cp, *str, *tofree;
+	char pathbuf[64] = { 0 };
+	char *size = NULL, *cpus = NULL, *domain_policy = NULL;
+
+	if (*opt == '\0') {
+		return (-1);
+	}
+
+	/* strsep() modifies its input, so work on a private copy. */
+	tofree = str = strdup(opt);
+	if (str == NULL)
+		errx(4, "Failed to allocate memory");
+
+	while ((cp = strsep(&str, ",")) != NULL) {
+		if (*cp == '\0') {
+			continue;
+		} else if (strncmp(cp, "id=", strlen("id=")) == 0) {
+			id = parse_int_value("id", cp + strlen("id="), 0,
+			    VM_MAXMEMDOM - 1);
+		} else if (strncmp(cp, "size=", strlen("size=")) == 0) {
+			size = cp + strlen("size=");
+		} else if (strncmp(cp,
+		    "domain_policy=", strlen("domain_policy=")) == 0) {
+			domain_policy = cp + strlen("domain_policy=");
+		} else if (strncmp(cp, "cpus=", strlen("cpus=")) == 0) {
+			cpus = cp + strlen("cpus=");
+		} else {
+			/* Reject typos instead of silently dropping them. */
+			EPRINTLN("Unknown NUMA domain option '%s' in '%s'",
+			    cp, opt);
+			goto out;
+		}
+	}
+
+	if (id == -1) {
+		EPRINTLN("Missing NUMA domain ID in '%s'", opt);
+		goto out;
+	}
+
+	/* Reuse the node if this domain id was already configured. */
+	snprintf(pathbuf, sizeof(pathbuf), "domains.%d", id);
+	nvl = find_config_node(pathbuf);
+	if (nvl == NULL)
+		nvl = create_config_node(pathbuf);
+	if (size != NULL)
+		set_config_value_node(nvl, "size", size);
+	if (domain_policy != NULL)
+		set_config_value_node(nvl, "domain_policy", domain_policy);
+	if (cpus != NULL)
+		set_config_value_node(nvl, "cpus", cpus);
+
+	ret = 0;
+out:
+	free(tofree);
+	return (ret);
+}
+
+/*
+ * Fill guest_domains[] and guest_ndomains from the "domains.<N>" config
+ * nodes, scanning N upward from 0 and stopping at the first missing node
+ * or at VM_MAXMEMDOM.
+ *
+ * Each domain gets its configured "size" (if any) and a freshly allocated
+ * domainset mask, parsed from "domain_policy" when that value is present.
+ * With no domains configured, a single domain 0 of 'vm_memsize' bytes is
+ * used.  If any configured domain lacks a size, every domain is instead
+ * given vm_memsize / guest_ndomains bytes.
+ *
+ * Exits via errx() on an unparsable size or policy, or on allocation
+ * failure.
+ */
+static void
+calc_mem_affinity(size_t vm_memsize)
+{
+	char node_path[64] = { 0 };
+	struct vm_mem_domain *md;
+	const char *val;
+	nvlist_t *node;
+	bool size_missing = false;
+	int n;
+
+	for (n = 0; n < VM_MAXMEMDOM; n++) {
+		snprintf(node_path, sizeof(node_path), "domains.%d", n);
+		node = find_config_node(node_path);
+		if (node == NULL)
+			break;
+		md = &guest_domains[n];
+
+		/* Memory size: remember if any domain leaves it out. */
+		val = get_config_value_node(node, "size");
+		if (val == NULL) {
+			size_missing = true;
+		} else if (vm_parse_memsize(val, &md->size)) {
+			errx(EX_USAGE, "invalid memsize for domain %d: '%s'", n,
+			    val);
+		}
+
+		/* Host allocation policy for the memory behind this domain. */
+		md->ds_mask = calloc(1, sizeof(domainset_t));
+		if (md->ds_mask == NULL)
+			errx(EX_OSERR, "Failed to allocate domainset mask");
+		md->ds_size = sizeof(domainset_t);
+
+		val = get_config_value_node(node, "domain_policy");
+		if (val != NULL) {
+			if (domainset_parselist(val, md->ds_mask,
+			    &md->ds_policy) != CPUSET_PARSE_OK)
+				errx(EX_USAGE,
+				    "failed to parse domain policy '%s'", val);
+		} else {
+			md->ds_policy = DOMAINSET_POLICY_INVALID;
+			DOMAINSET_ZERO(md->ds_mask);
+		}
+	}
+
+	guest_ndomains = n;
+	if (guest_ndomains == 0) {
+		/*
+		 * No domains were specified - create domain
+		 * 0 holding all CPUs and memory.
+		 */
+		guest_ndomains = 1;
+		guest_domains[0].size = vm_memsize;
+		return;
+	}
+	if (!size_missing)
+		return;
+
+	warnx("At least one domain memory size was not specified, distributing"
+	    " total VM memory size across all domains");
+	for (n = 0; n < guest_ndomains; n++)
+		guest_domains[n].size = vm_memsize / guest_ndomains;
+}
+
/*
* Set the sockets, cores, threads, and guest_cpus variables based on
* the configured topology.
@@ -340,6 +459,56 @@
}
}
+/*
+ * Register a vCPU-to-domain affinity with the ACPI code for every CPU
+ * listed in the "cpus" value of each "domains.<N>" config node, for N in
+ * [0, guest_ndomains).
+ *
+ * When there is a single domain and it has no config node, all guest_ncpus
+ * vCPUs are assigned to domain 0 instead.
+ *
+ * Exits with status 4 if a configured domain has no "cpus" value or if
+ * acpi_add_vcpu_affinity() fails.
+ */
+static void
+set_vcpu_affinities(void)
+{
+	int cpu, error;
+	nvlist_t *nvl = NULL;
+	cpuset_t cpus;
+	const char *value;
+	char pathbuf[64] = { 0 };
+
+	for (int dom = 0; dom < guest_ndomains; dom++) {
+		snprintf(pathbuf, sizeof(pathbuf), "domains.%d", dom);
+		nvl = find_config_node(pathbuf);
+		if (nvl == NULL)
+			break;
+
+		value = get_config_value_node(nvl, "cpus");
+		if (value == NULL) {
+			EPRINTLN("Missing CPU set for domain %d", dom);
+			exit(4);
+		}
+
+		parse_cpuset(dom, value, &cpus);
+		CPU_FOREACH_ISSET(cpu, &cpus) {
+			error = acpi_add_vcpu_affinity(cpu, dom);
+			if (error) {
+				/*
+				 * The failure reason is the return value;
+				 * errno is not set by the callee.
+				 */
+				EPRINTLN(
+				    "Unable to set vCPU %d affinity for domain %d: %s",
+				    cpu, dom, strerror(error));
+				exit(4);
+			}
+		}
+	}
+	/* Done unless the only domain is the implicit, unconfigured one. */
+	if (guest_ndomains > 1 || nvl != NULL)
+		return;
+
+	/*
+	 * If we're dealing with one domain and no cpuset was provided, create a
+	 * default one holding all cpus.
+	 */
+	for (cpu = 0; cpu < guest_ncpus; cpu++) {
+		error = acpi_add_vcpu_affinity(cpu, 0);
+		if (error) {
+			EPRINTLN(
+			    "Unable to set vCPU %d affinity for domain %d: %s",
+			    cpu, 0, strerror(error));
+			exit(4);
+		}
+	}
+}
+
void *
paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
{
@@ -713,18 +882,21 @@
vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid);
}
+ calc_mem_affinity(memsize);
memflags = 0;
if (get_config_bool_default("memory.wired", false))
memflags |= VM_MEM_F_WIRED;
if (get_config_bool_default("memory.guest_in_core", false))
memflags |= VM_MEM_F_INCORE;
vm_set_memflags(ctx, memflags);
- error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
+ error = vm_setup_memory_domains(ctx, VM_MMAP_ALL, guest_domains,
+ guest_ndomains);
if (error) {
fprintf(stderr, "Unable to setup memory (%d)\n", errno);
exit(4);
}
+ set_vcpu_affinities();
init_mem(guest_ncpus);
init_bootrom(ctx);
if (bhyve_init_platform(ctx, bsp) != 0)
diff --git a/usr.sbin/bhyve/bootrom.c b/usr.sbin/bhyve/bootrom.c
--- a/usr.sbin/bhyve/bootrom.c
+++ b/usr.sbin/bhyve/bootrom.c
@@ -31,6 +31,7 @@
#include <sys/mman.h>
#include <sys/stat.h>
+#include <dev/vmm/vmm_mem.h>
#include <machine/vmm.h>
#include <err.h>
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
--- a/usr.sbin/bhyve/pci_emul.c
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -42,6 +42,7 @@
#include <stdbool.h>
#include <sysexits.h>
+#include <dev/vmm/vmm_mem.h>
#include <machine/vmm.h>
#include <machine/vmm_snapshot.h>
#include <vmmapi.h>
diff --git a/usr.sbin/bhyve/pci_fbuf.c b/usr.sbin/bhyve/pci_fbuf.c
--- a/usr.sbin/bhyve/pci_fbuf.c
+++ b/usr.sbin/bhyve/pci_fbuf.c
@@ -29,6 +29,7 @@
#include <sys/types.h>
#include <sys/mman.h>
+#include <dev/vmm/vmm_mem.h>
#include <machine/vmm.h>
#include <machine/vmm_snapshot.h>
#include <vmmapi.h>
diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c
--- a/usr.sbin/bhyve/pci_passthru.c
+++ b/usr.sbin/bhyve/pci_passthru.c
@@ -38,6 +38,7 @@
#include <dev/io/iodev.h>
#include <dev/pci/pcireg.h>
+#include <dev/vmm/vmm_mem.h>
#include <vm/vm.h>
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Jan 18, 4:29 AM (11 h, 45 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
27704491
Default Alt Text
D44567.diff (16 KB)
Attached To
Mode
D44567: bhyve: Add support for specifying VM NUMA configuration
Attached
Detach File
Event Timeline
Log In to Comment